Merge branch 'main' into vlad/get-vectored-read-path

2026-05-24 08:30:37 +00:00 · 2024-02-15 18:33:17 +00:00
parent 8b44003647 046d9c69e6
commit bf3240ccae
237 changed files with 13135 additions and 3249 deletions
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -272,6 +272,12 @@ fn start_pageserver(
    );
    set_build_info_metric(GIT_VERSION, BUILD_TAG);
    set_launch_timestamp_metric(launch_ts);
+    #[cfg(target_os = "linux")]
+    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
+    metrics::register_internal(Box::new(
+        pageserver::metrics::tokio_epoll_uring::Collector::new(),
+    ))
+    .unwrap();
    pageserver::preinitialize_metrics();

    // If any failpoints were set from FAILPOINTS environment variable,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1383,6 +1383,7 @@ broker_endpoint = '{broker_endpoint}'
                parsed_remote_storage_config,
                RemoteStorageConfig {
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
+                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
            );
@@ -1450,6 +1451,7 @@ broker_endpoint = '{broker_endpoint}'
                        concurrency_limit: s3_concurrency_limit,
                        max_keys_per_list_response: None,
                    }),
+                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the S3 config"
            );
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -234,7 +234,7 @@ impl DeletionHeader {
        let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
        let header_path = conf.deletion_header_path();
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
+        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
            .await
            .maybe_fatal_err("save deletion header")?;

@@ -325,7 +325,7 @@ impl DeletionList {
        let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);

        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
+        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
            .await
            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
@@ -834,7 +834,6 @@ mod test {
    }

    impl ControlPlaneGenerationsApi for MockControlPlane {
-        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
@@ -868,6 +867,7 @@ mod test {
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

@@ -1171,6 +1171,7 @@ pub(crate) mod mock {
    pub struct ConsumerState {
        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
    }

    impl ConsumerState {
@@ -1184,7 +1185,7 @@ pub(crate) mod mock {
                match msg {
                    DeleterMessage::Delete(objects) => {
                        for path in objects {
-                            match remote_storage.delete(&path).await {
+                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
                                    debug!("Deleted {path}");
                                }
@@ -1217,7 +1218,7 @@ pub(crate) mod mock {

                        for path in objects {
                            info!("Executing deletion {path}");
-                            match remote_storage.delete(&path).await {
+                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
                                    debug!("Deleted {path}");
                                }
@@ -1267,7 +1268,11 @@ pub(crate) mod mock {
                executor_tx,
                executed,
                remote_storage,
-                consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
+                consumer: std::sync::Mutex::new(ConsumerState {
+                    rx,
+                    executor_rx,
+                    cancel: CancellationToken::new(),
+                }),
                lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
            }
        }
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -8,6 +8,7 @@

 use remote_storage::GenericRemoteStorage;
 use remote_storage::RemotePath;
+use remote_storage::TimeoutOrCancel;
 use remote_storage::MAX_KEYS_PER_DELETE;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
@@ -71,9 +72,11 @@ impl Deleter {
                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                });

-                self.remote_storage.delete_objects(&self.accumulator).await
+                self.remote_storage
+                    .delete_objects(&self.accumulator, &self.cancel)
+                    .await
            },
-            |_| false,
+            TimeoutOrCancel::caused_by_cancel,
            3,
            10,
            "executing deletion batch",
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -351,7 +351,6 @@ pub enum IterationOutcome<U> {
    Finished(IterationOutcomeFinished<U>),
 }

-#[allow(dead_code)]
 #[derive(Debug, Serialize)]
 pub struct IterationOutcomeFinished<U> {
    /// The actual usage observed before we started the iteration.
@@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished<U> {
 }

 #[derive(Debug, Serialize)]
-#[allow(dead_code)]
 struct AssumedUsage<U> {
    /// The expected value for `after`, after phase 2.
    projected_after: U,
@@ -374,14 +372,12 @@ struct AssumedUsage<U> {
    failed: LayerCount,
 }

-#[allow(dead_code)]
 #[derive(Debug, Serialize)]
 struct PlannedUsage<U> {
    respecting_tenant_min_resident_size: U,
    fallback_to_global_lru: Option<U>,
 }

-#[allow(dead_code)]
 #[derive(Debug, Default, Serialize)]
 struct LayerCount {
    file_sizes: u64,
@@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer {
 #[derive(Clone)]
 pub(crate) enum EvictionLayer {
    Attached(Layer),
-    #[allow(dead_code)]
    Secondary(EvictionSecondaryLayer),
 }

@@ -623,6 +618,7 @@ impl std::fmt::Display for EvictionLayer {
    }
 }

+#[derive(Default)]
 pub(crate) struct DiskUsageEvictionInfo {
    /// Timeline's largest layer (remote or resident)
    pub max_layer_size: Option<u64>,
@@ -854,19 +850,27 @@ async fn collect_eviction_candidates(

        let total = tenant_candidates.len();

-        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
-            // as we iterate this reverse sorted list, the most recently accessed layer will always
-            // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
+        let tenant_candidates =
+            tenant_candidates
+                .into_iter()
+                .enumerate()
+                .map(|(i, mut candidate)| {
+                    // as we iterate this reverse sorted list, the most recently accessed layer will always
+                    // be 1.0; this is for us to evict it last.
+                    candidate.relative_last_activity =
+                        eviction_order.relative_last_activity(total, i);

-            let partition = if cumsum > min_resident_size as i128 {
-                MinResidentSizePartition::Above
-            } else {
-                MinResidentSizePartition::Below
-            };
-            cumsum += i128::from(candidate.layer.get_file_size());
-            candidates.push((partition, candidate));
-        }
+                    let partition = if cumsum > min_resident_size as i128 {
+                        MinResidentSizePartition::Above
+                    } else {
+                        MinResidentSizePartition::Below
+                    };
+                    cumsum += i128::from(candidate.layer.get_file_size());
+
+                    (partition, candidate)
+                });
+
+        candidates.extend(tenant_candidates);
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -882,21 +886,41 @@ async fn collect_eviction_candidates(
    );

    for secondary_tenant in secondary_tenants {
-        let mut layer_info = secondary_tenant.get_layers_for_eviction();
+        // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
+        // to prevent repeated disk usage based evictions from completely draining less often
+        // updating secondaries.
+        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
+
+        debug_assert!(
+            total_layers >= layer_info.resident_layers.len(),
+            "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
+            layer_info.resident_layers.len()
+        );

        layer_info
            .resident_layers
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));

-        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
-            (
-                // Secondary locations' layers are always considered above the min resident size,
-                // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                // the layers have sufficiently old access times.
-                MinResidentSizePartition::Above,
-                candidate,
-            )
-        }));
+        let tenant_candidates =
+            layer_info
+                .resident_layers
+                .into_iter()
+                .enumerate()
+                .map(|(i, mut candidate)| {
+                    candidate.relative_last_activity =
+                        eviction_order.relative_last_activity(total_layers, i);
+                    (
+                        // Secondary locations' layers are always considered above the min resident size,
+                        // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                        // the layers have sufficiently old access times.
+                        MinResidentSizePartition::Above,
+                        candidate,
+                    )
+                });
+
+        candidates.extend(tenant_candidates);
+
+        tokio::task::yield_now().await;
    }

    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -1076,7 +1100,6 @@ mod filesystem_level_usage {
    use super::DiskUsageEvictionTaskConfig;

    #[derive(Debug, Clone, Copy)]
-    #[allow(dead_code)]
    pub struct Usage<'a> {
        config: &'a DiskUsageEvictionTaskConfig,

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,11 +19,14 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantShardLocation;
+use pageserver_api::models::TenantShardSplitRequest;
+use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
 };
+use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
@@ -419,6 +422,7 @@ async fn build_timeline_info_common(
            tenant::timeline::logical_size::Accuracy::Approximate => false,
            tenant::timeline::logical_size::Accuracy::Exact => true,
        },
+        directory_entries_counts: timeline.get_directory_metrics().to_vec(),
        current_physical_size,
        current_logical_size_non_incremental: None,
        timeline_dir_layer_file_size_sum: None,
@@ -485,7 +489,9 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

@@ -495,48 +501,62 @@ async fn timeline_create_handler(
            tracing::info!("bootstrapping");
        }

-        match tenant.create_timeline(
-            new_timeline_id,
-            request_data.ancestor_timeline_id.map(TimelineId::from),
-            request_data.ancestor_start_lsn,
-            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-            request_data.existing_initdb_timeline_id,
-            state.broker_client.clone(),
-            &ctx,
-        )
-        .await {
+        match tenant
+            .create_timeline(
+                new_timeline_id,
+                request_data.ancestor_timeline_id,
+                request_data.ancestor_start_lsn,
+                request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+                request_data.existing_initdb_timeline_id,
+                state.broker_client.clone(),
+                &ctx,
+            )
+            .await
+        {
            Ok(new_timeline) => {
                // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
-                    .await
-                    .map_err(ApiError::InternalServerError)?;
+                let timeline_info = build_timeline_info_common(
+                    &new_timeline,
+                    &ctx,
+                    tenant::timeline::GetLogicalSizePriority::User,
+                )
+                .await
+                .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
            Err(_) if tenant.cancel.is_cancelled() => {
                // In case we get some ugly error type during shutdown, cast it into a clean 503.
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
-            }
-            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
-                json_response(StatusCode::CONFLICT, ())
-            }
-            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
-                json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
-                    format!("{err:#}")
-                ))
-            }
-            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
-            }
-            Err(tenant::CreateTimelineError::ShuttingDown) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
+                json_response(
+                    StatusCode::SERVICE_UNAVAILABLE,
+                    HttpErrorBody::from_msg("Tenant shutting down".to_string()),
+                )
            }
+            Err(
+                tenant::CreateTimelineError::Conflict
+                | tenant::CreateTimelineError::AlreadyCreating,
+            ) => json_response(StatusCode::CONFLICT, ()),
+            Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
+                StatusCode::NOT_ACCEPTABLE,
+                HttpErrorBody::from_msg(format!("{err:#}")),
+            ),
+            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response(
+                StatusCode::SERVICE_UNAVAILABLE,
+                HttpErrorBody::from_msg(e.to_string()),
+            ),
+            Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
+                StatusCode::SERVICE_UNAVAILABLE,
+                HttpErrorBody::from_msg("tenant shutting down".to_string()),
+            ),
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
    .instrument(info_span!("timeline_create",
        tenant_id = %tenant_shard_id.tenant_id,
        shard_id = %tenant_shard_id.shard_slug(),
-        timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+        timeline_id = %new_timeline_id,
+        lsn=?request_data.ancestor_start_lsn,
+        pg_version=?request_data.pg_version
+    ))
    .await
 }

@@ -875,7 +895,7 @@ async fn tenant_reset_handler(
    let state = get_state(&request);
    state
        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -1104,6 +1124,25 @@ async fn tenant_size_handler(
    )
 }

+async fn tenant_shard_split_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let req: TenantShardSplitRequest = json_request(&mut request).await?;
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let state = get_state(&request);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    let new_shards = state
+        .tenant_manager
+        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
+}
+
 async fn layer_map_info_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1908,6 +1947,16 @@ async fn post_tracing_event_handler(
    json_response(StatusCode::OK, ())
 }

+async fn put_io_engine_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
+    crate::virtual_file::io_engine::set(kind);
+    json_response(StatusCode::OK, ())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2054,6 +2103,9 @@ pub fn make_router(
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
+        .put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
+            api_handler(r, tenant_shard_split_handler)
+        })
        .get("/v1/tenant/:tenant_shard_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
@@ -2163,7 +2215,8 @@ pub fn make_router(
        )
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
-            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
+            |r| api_handler(r, timeline_collect_keyspace),
        )
+        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size {
        });
 }

+static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_directory_entries_count",
+        "Sum of the entries in pageserver-stored directory listings",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
@@ -1809,6 +1818,7 @@ pub(crate) struct TimelineMetrics {
    resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
+    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
    pub num_persistent_files_created: IntCounter,
    pub persistent_bytes_written: IntCounter,
    pub evictions: IntCounter,
@@ -1818,12 +1828,12 @@ pub(crate) struct TimelineMetrics {
 impl TimelineMetrics {
    pub fn new(
        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
+        timeline_id_raw: &TimelineId,
        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
    ) -> Self {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", tenant_shard_id.shard_slug());
-        let timeline_id = timeline_id.to_string();
+        let timeline_id = timeline_id_raw.to_string();
        let flush_time_histo = StorageTimeMetrics::new(
            StorageTimeOperation::LayerFlush,
            &tenant_id,
@@ -1876,6 +1886,22 @@ impl TimelineMetrics {
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
+        // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
+        let directory_entries_count_gauge_closure = {
+            let tenant_shard_id = *tenant_shard_id;
+            let timeline_id_raw = *timeline_id_raw;
+            move || {
+                let tenant_id = tenant_shard_id.tenant_id.to_string();
+                let shard_id = format!("{}", tenant_shard_id.shard_slug());
+                let timeline_id = timeline_id_raw.to_string();
+                let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
+                    .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+                    .unwrap();
+                gauge
+            }
+        };
+        let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
+            Lazy::new(Box::new(directory_entries_count_gauge_closure));
        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -1902,6 +1928,7 @@ impl TimelineMetrics {
            last_record_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
+            directory_entries_count_gauge,
            num_persistent_files_created,
            persistent_bytes_written,
            evictions,
@@ -1944,6 +1971,9 @@ impl Drop for TimelineMetrics {
                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
        }
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
+            let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        }
        let _ =
            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
@@ -2400,6 +2430,72 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
    }
 }

+pub mod tokio_epoll_uring {
+    use metrics::UIntGauge;
+
+    pub struct Collector {
+        descs: Vec<metrics::core::Desc>,
+        systems_created: UIntGauge,
+        systems_destroyed: UIntGauge,
+    }
+
+    const NMETRICS: usize = 2;
+
+    impl metrics::core::Collector for Collector {
+        fn desc(&self) -> Vec<&metrics::core::Desc> {
+            self.descs.iter().collect()
+        }
+
+        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+            let mut mfs = Vec::with_capacity(NMETRICS);
+            let tokio_epoll_uring::metrics::Metrics {
+                systems_created,
+                systems_destroyed,
+            } = tokio_epoll_uring::metrics::global();
+            self.systems_created.set(systems_created);
+            mfs.extend(self.systems_created.collect());
+            self.systems_destroyed.set(systems_destroyed);
+            mfs.extend(self.systems_destroyed.collect());
+            mfs
+        }
+    }
+
+    impl Collector {
+        #[allow(clippy::new_without_default)]
+        pub fn new() -> Self {
+            let mut descs = Vec::new();
+
+            let systems_created = UIntGauge::new(
+                "pageserver_tokio_epoll_uring_systems_created",
+                "counter of tokio-epoll-uring systems that were created",
+            )
+            .unwrap();
+            descs.extend(
+                metrics::core::Collector::desc(&systems_created)
+                    .into_iter()
+                    .cloned(),
+            );
+
+            let systems_destroyed = UIntGauge::new(
+                "pageserver_tokio_epoll_uring_systems_destroyed",
+                "counter of tokio-epoll-uring systems that were destroyed",
+            )
+            .unwrap();
+            descs.extend(
+                metrics::core::Collector::desc(&systems_destroyed)
+                    .into_iter()
+                    .cloned(),
+            );
+
+            Self {
+                descs,
+                systems_created,
+                systems_destroyed,
+            }
+        }
+    }
+}
+
 pub fn preinitialize_metrics() {
    // Python tests need these and on some we do alerting.
    //
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -91,8 +91,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 /// `tokio_tar` already read the first such block. Read the second all-zeros block,
 /// and check that there is no more data after the EOF marker.
 ///
-/// XXX: Currently, any trailing data after the EOF marker prints a warning.
-/// Perhaps it should be a hard error?
+/// 'tar' command can also write extra blocks of zeros, up to a record
+/// size, controlled by the --record-size argument. Ignore them too.
 async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];
@@ -113,17 +113,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
        anyhow::bail!("invalid tar EOF marker");
    }

-    // Drain any data after the EOF marker
+    // Drain any extra zero-blocks after the EOF marker
    let mut trailing_bytes = 0;
+    let mut seen_nonzero_bytes = false;
    loop {
        let nbytes = reader.read(&mut buf).await?;
        trailing_bytes += nbytes;
+        if !buf.iter().all(|&x| x == 0) {
+            seen_nonzero_bytes = true;
+        }
        if nbytes == 0 {
            break;
        }
    }
-    if trailing_bytes > 0 {
-        warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
+    if seen_nonzero_bytes {
+        anyhow::bail!("unexpected non-zero bytes after the tar archive");
+    }
+    if trailing_bytes % 512 != 0 {
+        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
    }
    Ok(())
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -14,6 +14,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
+use enum_map::Enum;
 use pageserver_api::key::{
    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -155,6 +156,8 @@ impl Timeline {
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
+            pending_aux_files: None,
+            pending_directory_entries: Vec::new(),
            lsn,
        }
    }
@@ -868,6 +871,15 @@ pub struct DatadirModification<'a> {
    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,
+
+    // If we already wrote any aux file changes in this modification, stash the latest dir.  If set,
+    // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
+    // if AUX_FILES_KEY is already set.
+    pending_aux_files: Option<AuxFilesDirectory>,
+
+    /// For special "directory" keys that store key-value maps, track the size of the map
+    /// if it was updated in this modification.
+    pending_directory_entries: Vec<(DirectoryKind, usize)>,
 }

 impl<'a> DatadirModification<'a> {
@@ -899,6 +911,7 @@ impl<'a> DatadirModification<'a> {
        let buf = DbDirectory::ser(&DbDirectory {
            dbdirs: HashMap::new(),
        })?;
+        self.pending_directory_entries.push((DirectoryKind::Db, 0));
        self.put(DBDIR_KEY, Value::Image(buf.into()));

        // Create AuxFilesDirectory
@@ -907,16 +920,24 @@ impl<'a> DatadirModification<'a> {
        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
            xids: HashSet::new(),
        })?;
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, 0));
        self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));

        let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
        let empty_dir = Value::Image(buf);
        self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
        self.put(
            slru_dir_to_key(SlruKind::MultiXactMembers),
            empty_dir.clone(),
        );
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
        self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));

        Ok(())
    }
@@ -1017,6 +1038,7 @@ impl<'a> DatadirModification<'a> {
            let buf = RelDirectory::ser(&RelDirectory {
                rels: HashSet::new(),
            })?;
+            self.pending_directory_entries.push((DirectoryKind::Rel, 0));
            self.put(
                rel_dir_to_key(spcnode, dbnode),
                Value::Image(Bytes::from(buf)),
@@ -1039,6 +1061,8 @@ impl<'a> DatadirModification<'a> {
        if !dir.xids.insert(xid) {
            anyhow::bail!("twophase file for xid {} already exists", xid);
        }
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, dir.xids.len()));
        self.put(
            TWOPHASEDIR_KEY,
            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
@@ -1074,6 +1098,8 @@ impl<'a> DatadirModification<'a> {
        let mut dir = DbDirectory::des(&buf)?;
        if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
            let buf = DbDirectory::ser(&dir)?;
+            self.pending_directory_entries
+                .push((DirectoryKind::Db, dir.dbdirs.len()));
            self.put(DBDIR_KEY, Value::Image(buf.into()));
        } else {
            warn!(
@@ -1111,6 +1137,8 @@ impl<'a> DatadirModification<'a> {
            // Didn't exist. Update dbdir
            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
            let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+            self.pending_directory_entries
+                .push((DirectoryKind::Db, dbdir.dbdirs.len()));
            self.put(DBDIR_KEY, Value::Image(buf.into()));

            // and create the RelDirectory
@@ -1125,6 +1153,10 @@ impl<'a> DatadirModification<'a> {
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
            return Err(RelationError::AlreadyExists);
        }
+
+        self.pending_directory_entries
+            .push((DirectoryKind::Rel, rel_dir.rels.len()));
+
        self.put(
            rel_dir_key,
            Value::Image(Bytes::from(
@@ -1216,6 +1248,9 @@ impl<'a> DatadirModification<'a> {
        let buf = self.get(dir_key, ctx).await?;
        let mut dir = RelDirectory::des(&buf)?;

+        self.pending_directory_entries
+            .push((DirectoryKind::Rel, dir.rels.len()));
+
        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
        } else {
@@ -1251,6 +1286,8 @@ impl<'a> DatadirModification<'a> {
        if !dir.segments.insert(segno) {
            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
        }
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
        self.put(
            dir_key,
            Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1295,6 +1332,8 @@ impl<'a> DatadirModification<'a> {
        if !dir.segments.remove(&segno) {
            warn!("slru segment {:?}/{} does not exist", kind, segno);
        }
+        self.pending_directory_entries
+            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
        self.put(
            dir_key,
            Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1325,6 +1364,8 @@ impl<'a> DatadirModification<'a> {
        if !dir.xids.remove(&xid) {
            warn!("twophase file for xid {} does not exist", xid);
        }
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, dir.xids.len()));
        self.put(
            TWOPHASEDIR_KEY,
            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
@@ -1340,6 +1381,8 @@ impl<'a> DatadirModification<'a> {
        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
            files: HashMap::new(),
        })?;
+        self.pending_directory_entries
+            .push((DirectoryKind::AuxFiles, 0));
        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
        Ok(())
    }
@@ -1350,28 +1393,76 @@ impl<'a> DatadirModification<'a> {
        content: &[u8],
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
-            Ok(buf) => AuxFilesDirectory::des(&buf)?,
-            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
-                AuxFilesDirectory {
-                    files: HashMap::new(),
+        let file_path = path.to_string();
+        let content = if content.is_empty() {
+            None
+        } else {
+            Some(Bytes::copy_from_slice(content))
+        };
+
+        let dir = if let Some(mut dir) = self.pending_aux_files.take() {
+            // We already updated aux files in `self`: emit a delta and update our latest value
+
+            self.put(
+                AUX_FILES_KEY,
+                Value::WalRecord(NeonWalRecord::AuxFile {
+                    file_path: file_path.clone(),
+                    content: content.clone(),
+                }),
+            );
+
+            dir.upsert(file_path, content);
+            dir
+        } else {
+            // Check if the AUX_FILES_KEY is initialized
+            match self.get(AUX_FILES_KEY, ctx).await {
+                Ok(dir_bytes) => {
+                    let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
+                    // Key is already set, we may append a delta
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::WalRecord(NeonWalRecord::AuxFile {
+                            file_path: file_path.clone(),
+                            content: content.clone(),
+                        }),
+                    );
+                    dir.upsert(file_path, content);
+                    dir
+                }
+                Err(
+                    e @ (PageReconstructError::AncestorStopping(_)
+                    | PageReconstructError::Cancelled
+                    | PageReconstructError::AncestorLsnTimeout(_)),
+                ) => {
+                    // Important that we do not interpret a shutdown error as "not found" and thereby
+                    // reset the map.
+                    return Err(e.into());
+                }
+                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
+                // we are assuming that all _other_ possible errors represents a missing key.  If some
+                // other error occurs, we may incorrectly reset the map of aux files.
+                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                    // Key is missing, we must insert an image as the basis for subsequent deltas.
+
+                    let mut dir = AuxFilesDirectory {
+                        files: HashMap::new(),
+                    };
+                    dir.upsert(file_path, content);
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::Image(Bytes::from(
+                            AuxFilesDirectory::ser(&dir).context("serialize")?,
+                        )),
+                    );
+                    dir
                }
            }
        };
-        let path = path.to_string();
-        if content.is_empty() {
-            dir.files.remove(&path);
-        } else {
-            dir.files.insert(path, Bytes::copy_from_slice(content));
-        }
-        self.put(
-            AUX_FILES_KEY,
-            Value::Image(Bytes::from(
-                AuxFilesDirectory::ser(&dir).context("serialize")?,
-            )),
-        );
+
+        self.pending_directory_entries
+            .push((DirectoryKind::AuxFiles, dir.files.len()));
+        self.pending_aux_files = Some(dir);
+
        Ok(())
    }

@@ -1427,6 +1518,10 @@ impl<'a> DatadirModification<'a> {
            self.pending_nblocks = 0;
        }

+        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
+            writer.update_directory_entries_count(kind, count as u64);
+        }
+
        Ok(())
    }

@@ -1464,6 +1559,10 @@ impl<'a> DatadirModification<'a> {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
        }

+        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
+            writer.update_directory_entries_count(kind, count as u64);
+        }
+
        Ok(())
    }

@@ -1573,8 +1672,18 @@ struct RelDirectory {
 }

 #[derive(Debug, Serialize, Deserialize, Default)]
-struct AuxFilesDirectory {
-    files: HashMap<String, Bytes>,
+pub(crate) struct AuxFilesDirectory {
+    pub(crate) files: HashMap<String, Bytes>,
+}
+
+impl AuxFilesDirectory {
+    pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
+        if let Some(value) = value {
+            self.files.insert(key, value);
+        } else {
+            self.files.remove(&key);
+        }
+    }
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -1588,13 +1697,82 @@ struct SlruSegmentDirectory {
    segments: HashSet<u32>,
 }

+#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
+#[repr(u8)]
+pub(crate) enum DirectoryKind {
+    Db,
+    TwoPhase,
+    Rel,
+    AuxFiles,
+    SlruSegment(SlruKind),
+}
+
+impl DirectoryKind {
+    pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
+    pub(crate) fn offset(&self) -> usize {
+        self.into_usize()
+    }
+}
+
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    //use super::repo_harness::*;
-    //use super::*;
+    use hex_literal::hex;
+    use utils::id::TimelineId;
+
+    use super::*;
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
+    #[tokio::test]
+    async fn aux_files_round_trip() -> anyhow::Result<()> {
+        let name = "aux_files_round_trip";
+        let harness = TenantHarness::create(name)?;
+
+        pub const TIMELINE_ID: TimelineId =
+            TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        let tline = tline.raw_timeline().unwrap();
+
+        // First modification: insert two keys
+        let mut modification = tline.begin_modification(Lsn(0x1000));
+        modification.put_file("foo/bar1", b"content1", &ctx).await?;
+        modification.set_lsn(Lsn(0x1008))?;
+        modification.put_file("foo/bar2", b"content2", &ctx).await?;
+        modification.commit(&ctx).await?;
+        let expect_1008 = HashMap::from([
+            ("foo/bar1".to_string(), Bytes::from_static(b"content1")),
+            ("foo/bar2".to_string(), Bytes::from_static(b"content2")),
+        ]);
+
+        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        assert_eq!(readback, expect_1008);
+
+        // Second modification: update one key, remove the other
+        let mut modification = tline.begin_modification(Lsn(0x2000));
+        modification.put_file("foo/bar1", b"content3", &ctx).await?;
+        modification.set_lsn(Lsn(0x2008))?;
+        modification.put_file("foo/bar2", b"", &ctx).await?;
+        modification.commit(&ctx).await?;
+        let expect_2008 =
+            HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
+
+        let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
+        assert_eq!(readback, expect_2008);
+
+        // Reading back in time works
+        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        assert_eq!(readback, expect_1008);
+
+        Ok(())
+    }

    /*
        fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -30,10 +30,6 @@
 //! only a single tenant or timeline.
 //!

-// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
-// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
-#![allow(clippy::declare_interior_mutable_const)]
-
 use std::collections::HashMap;
 use std::fmt;
 use std::future::Future;
@@ -312,7 +308,6 @@ struct MutableTaskState {
 }

 struct PageServerTask {
-    #[allow(dead_code)] // unused currently
    task_id: PageserverTaskId,

    kind: TaskKind,
@@ -576,8 +571,8 @@ pub fn shutdown_token() -> CancellationToken {

 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
-        cancel.is_cancelled()
+    if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) {
+        true_or_false
    } else {
        if !cfg!(test) {
            warn!("is_shutdown_requested() called in an unexpected task or thread");
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -25,6 +25,7 @@ use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
+use remote_storage::TimeoutOrCancel;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -53,6 +54,7 @@ use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
+use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
@@ -643,10 +645,10 @@ impl Tenant {

        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
        // we shut down while attaching.
-        let Ok(attach_gate_guard) = tenant.gate.enter() else {
-            // We just created the Tenant: nothing else can have shut it down yet
-            unreachable!();
-        };
+        let attach_gate_guard = tenant
+            .gate
+            .enter()
+            .expect("We just created the Tenant: nothing else can have shut it down yet");

        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
@@ -754,36 +756,27 @@ impl Tenant {
                    AttachType::Normal
                };

-                let preload_timer = TENANT.preload.start_timer();
-                let preload = match mode {
-                    SpawnMode::Create => {
-                        // Don't count the skipped preload into the histogram of preload durations
-                        preload_timer.stop_and_discard();
+                let preload = match (&mode, &remote_storage) {
+                    (SpawnMode::Create, _) => {
                        None
                    },
-                    SpawnMode::Normal => {
-                        match &remote_storage {
-                            Some(remote_storage) => Some(
-                                match tenant_clone
-                                    .preload(remote_storage, task_mgr::shutdown_token())
-                                    .instrument(
-                                        tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
-                                    )
-                                    .await {
-                                        Ok(p) => {
-                                            preload_timer.observe_duration();
-                                            p
-                                        }
-                                            ,
-                                        Err(e) => {
-                                            make_broken(&tenant_clone, anyhow::anyhow!(e));
-                                                return Ok(());
-                                        }
-                                    },
-                            ),
-                            None => None,
+                    (SpawnMode::Normal, Some(remote_storage)) => {
+                        let _preload_timer = TENANT.preload.start_timer();
+                        let res = tenant_clone
+                            .preload(remote_storage, task_mgr::shutdown_token())
+                            .await;
+                        match res {
+                            Ok(p) => Some(p),
+                            Err(e) => {
+                                make_broken(&tenant_clone, anyhow::anyhow!(e));
+                                return Ok(());
+                            }
                        }
                    }
+                    (SpawnMode::Normal, None) => {
+                        let _preload_timer = TENANT.preload.start_timer();
+                        None
+                    }
                };

                // Remote preload is complete.
@@ -819,36 +812,37 @@ impl Tenant {
                        info!("ready for backgound jobs barrier");
                    }

-                    match DeleteTenantFlow::resume_from_attach(
+                    let deleted = DeleteTenantFlow::resume_from_attach(
                        deletion,
                        &tenant_clone,
                        preload,
                        tenants,
                        &ctx,
                    )
-                    .await
-                    {
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                        Ok(()) => return Ok(()),
+                    .await;
+
+                    if let Err(e) = deleted {
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }
+
+                    return Ok(());
                }

                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
-                let attach_timer = match mode {
-                    SpawnMode::Create => None,
-                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                let attached = {
+                    let _attach_timer = match mode {
+                        SpawnMode::Create => None,
+                        SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
+                    };
+                    tenant_clone.attach(preload, mode, &ctx).await
                };
-                match tenant_clone.attach(preload, mode, &ctx).await {
+
+                match attached {
                    Ok(()) => {
                        info!("attach finished, activating");
-                        if let Some(t)=  attach_timer {t.observe_duration();}
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
-                        if let Some(t)=  attach_timer {t.observe_duration();}
                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }
                }
@@ -861,34 +855,26 @@ impl Tenant {
                // logical size calculations: if logical size calculation semaphore is saturated,
                // then warmup will wait for that before proceeding to the next tenant.
                if let AttachType::Warmup(_permit) = attach_type {
-                    let mut futs = FuturesUnordered::new();
-                    let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
-                    for t in timelines {
-                        futs.push(t.await_initial_logical_size())
-                    }
+                    let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
                    tracing::info!("Waiting for initial logical sizes while warming up...");
-                    while futs.next().await.is_some() {
-
-                    }
+                    while futs.next().await.is_some() {}
                    tracing::info!("Warm-up complete");
                }

                Ok(())
            }
-            .instrument({
-                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
-                span.follows_from(Span::current());
-                span
-            }),
+            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
        );
        Ok(tenant)
    }

+    #[instrument(skip_all)]
    pub(crate) async fn preload(
        self: &Arc<Tenant>,
        remote_storage: &GenericRemoteStorage,
        cancel: CancellationToken,
    ) -> anyhow::Result<TenantPreload> {
+        span::debug_assert_current_span_has_tenant_id();
        // Get list of remote timelines
        // download index files for every tenant timeline
        info!("listing remote timelines");
@@ -1376,7 +1362,7 @@ impl Tenant {
                async move {
                    debug!("starting index part download");

-                    let index_part = client.download_index_file(cancel_clone).await;
+                    let index_part = client.download_index_file(&cancel_clone).await;

                    debug!("finished index part download");

@@ -2397,6 +2383,67 @@ impl Tenant {
    pub(crate) fn get_generation(&self) -> Generation {
        self.generation
    }
+
+    /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
+    /// and can leave the tenant in a bad state if it fails.  The caller is responsible for
+    /// resetting this tenant to a valid state if we fail.
+    pub(crate) async fn split_prepare(
+        &self,
+        child_shards: &Vec<TenantShardId>,
+    ) -> anyhow::Result<()> {
+        let timelines = self.timelines.lock().unwrap().clone();
+        for timeline in timelines.values() {
+            let Some(tl_client) = &timeline.remote_client else {
+                anyhow::bail!("Remote storage is mandatory");
+            };
+
+            let Some(remote_storage) = &self.remote_storage else {
+                anyhow::bail!("Remote storage is mandatory");
+            };
+
+            // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
+            // to ensure that they do not start a split if currently in the process of doing these.
+
+            // Upload an index from the parent: this is partly to provide freshness for the
+            // child tenants that will copy it, and partly for general ease-of-debugging: there will
+            // always be a parent shard index in the same generation as we wrote the child shard index.
+            tl_client.schedule_index_upload_for_file_changes()?;
+            tl_client.wait_completion().await?;
+
+            // Shut down the timeline's remote client: this means that the indices we write
+            // for child shards will not be invalidated by the parent shard deleting layers.
+            tl_client.shutdown().await?;
+
+            // Download methods can still be used after shutdown, as they don't flow through the remote client's
+            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
+            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
+            // we use here really is the remotely persistent one).
+            let result = tl_client
+                .download_index_file(&self.cancel)
+                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
+                .await?;
+            let index_part = match result {
+                MaybeDeletedIndexPart::Deleted(_) => {
+                    anyhow::bail!("Timeline deletion happened concurrently with split")
+                }
+                MaybeDeletedIndexPart::IndexPart(p) => p,
+            };
+
+            for child_shard in child_shards {
+                upload_index_part(
+                    remote_storage,
+                    child_shard,
+                    &timeline.timeline_id,
+                    self.generation,
+                    &index_part,
+                    &self.cancel,
+                )
+                .await?;
+            }
+        }
+
+        Ok(())
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2834,7 +2881,7 @@ impl Tenant {
        let config_path = config_path.to_owned();
        tokio::task::spawn_blocking(move || {
            Handle::current().block_on(async move {
-                let conf_content = conf_content.as_bytes();
+                let conf_content = conf_content.into_bytes();
                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
                    .await
                    .with_context(|| {
@@ -2871,7 +2918,7 @@ impl Tenant {
        let target_config_path = target_config_path.to_owned();
        tokio::task::spawn_blocking(move || {
            Handle::current().block_on(async move {
-                let conf_content = conf_content.as_bytes();
+                let conf_content = conf_content.into_bytes();
                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
                    .await
                    .with_context(|| {
@@ -3229,7 +3276,7 @@ impl Tenant {

    /// For unit tests, make this visible so that other modules can directly create timelines
    #[cfg(test)]
-    #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
+    #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
    pub(crate) async fn bootstrap_timeline_test(
        &self,
        timeline_id: TimelineId,
@@ -3293,7 +3340,7 @@ impl Tenant {
            &self.cancel,
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)
    }

@@ -3343,8 +3390,10 @@ impl Tenant {
                );
                let dest_path =
                    &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
+
+                // if this fails, it will get retried by retried control plane requests
                storage
-                    .copy_object(source_path, dest_path)
+                    .copy_object(source_path, dest_path, &self.cancel)
                    .await
                    .context("copy initdb tar")?;
            }
@@ -3732,6 +3781,10 @@ impl Tenant {

        Ok(())
    }
+
+    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
+        self.tenant_conf.read().unwrap().tenant_conf
+    }
 }

 fn remove_timeline_and_uninit_mark(
@@ -3864,6 +3917,7 @@ pub(crate) mod harness {
    use utils::lsn::Lsn;

    use crate::deletion_queue::mock::MockDeletionQueue;
+    use crate::walredo::apply_neon;
    use crate::{
        config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
    };
@@ -3879,8 +3933,7 @@ pub(crate) mod harness {
        TimelineId::from_array(hex!("AA223344556677881122334455667788"));

    /// Convenience function to create a page image with given string as the only content
-    #[allow(non_snake_case)]
-    pub fn TEST_IMG(s: &str) -> Bytes {
+    pub fn test_img(s: &str) -> Bytes {
        let mut buf = BytesMut::new();
        buf.extend_from_slice(s.as_bytes());
        buf.resize(64, 0);
@@ -3916,6 +3969,8 @@ pub(crate) mod harness {
        }
    }

+    #[cfg(test)]
+    #[derive(Debug)]
    enum LoadMode {
        Local,
        Remote,
@@ -3978,6 +4033,7 @@ pub(crate) mod harness {
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
            let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
@@ -3998,7 +4054,7 @@ pub(crate) mod harness {
            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }

-        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
            (
                self.try_load(&ctx)
@@ -4008,6 +4064,74 @@ pub(crate) mod harness {
            )
        }

+        /// For tests that specifically want to exercise the local load path, which does
+        /// not use remote storage.
+        pub(crate) async fn try_load_local(
+            &self,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<Arc<Tenant>> {
+            self.do_try_load(ctx, LoadMode::Local).await
+        }
+
+        /// The 'load' in this function is either a local load or a normal attachment,
+        pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
+            // If we have nothing in remote storage, must use load_local instead of attach: attach
+            // will error out if there are no timelines.
+            //
+            // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
+            // this weird state of a Tenant which exists but doesn't have any timelines.
+            let mode = match self.remote_empty() {
+                true => LoadMode::Local,
+                false => LoadMode::Remote,
+            };
+
+            self.do_try_load(ctx, mode).await
+        }
+
+        #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))]
+        async fn do_try_load(
+            &self,
+            ctx: &RequestContext,
+            mode: LoadMode,
+        ) -> anyhow::Result<Arc<Tenant>> {
+            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
+
+            let tenant = Arc::new(Tenant::new(
+                TenantState::Loading,
+                self.conf,
+                AttachedTenantConf::try_from(LocationConf::attached_single(
+                    TenantConfOpt::from(self.tenant_conf),
+                    self.generation,
+                    &ShardParameters::default(),
+                ))
+                .unwrap(),
+                // This is a legacy/test code path: sharding isn't supported here.
+                ShardIdentity::unsharded(),
+                Some(walredo_mgr),
+                self.tenant_shard_id,
+                Some(self.remote_storage.clone()),
+                self.deletion_queue.new_client(),
+            ));
+
+            match mode {
+                LoadMode::Local => {
+                    tenant.load_local(ctx).await?;
+                }
+                LoadMode::Remote => {
+                    let preload = tenant
+                        .preload(&self.remote_storage, CancellationToken::new())
+                        .await?;
+                    tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
+                }
+            }
+
+            tenant.state.send_replace(TenantState::Active);
+            for timeline in tenant.timelines.lock().unwrap().values() {
+                timeline.set_state(TimelineState::Active);
+            }
+            Ok(tenant)
+        }
+
        fn remote_empty(&self) -> bool {
            let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
            let remote_tenant_dir = self
@@ -4033,77 +4157,6 @@ pub(crate) mod harness {
            }
        }

-        async fn do_try_load(
-            &self,
-            ctx: &RequestContext,
-            mode: LoadMode,
-        ) -> anyhow::Result<Arc<Tenant>> {
-            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
-
-            let tenant = Arc::new(Tenant::new(
-                TenantState::Loading,
-                self.conf,
-                AttachedTenantConf::try_from(LocationConf::attached_single(
-                    TenantConfOpt::from(self.tenant_conf),
-                    self.generation,
-                    &ShardParameters::default(),
-                ))
-                .unwrap(),
-                // This is a legacy/test code path: sharding isn't supported here.
-                ShardIdentity::unsharded(),
-                Some(walredo_mgr),
-                self.tenant_shard_id,
-                Some(self.remote_storage.clone()),
-                self.deletion_queue.new_client(),
-            ));
-
-            match mode {
-                LoadMode::Local => {
-                    tenant
-                        .load_local(ctx)
-                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
-                        .await?;
-                }
-                LoadMode::Remote => {
-                    let preload = tenant
-                        .preload(&self.remote_storage, CancellationToken::new())
-                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
-                        .await?;
-                    tenant
-                        .attach(Some(preload), SpawnMode::Normal, ctx)
-                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
-                        .await?;
-                }
-            }
-
-            tenant.state.send_replace(TenantState::Active);
-            for timeline in tenant.timelines.lock().unwrap().values() {
-                timeline.set_state(TimelineState::Active);
-            }
-            Ok(tenant)
-        }
-
-        /// For tests that specifically want to exercise the local load path, which does
-        /// not use remote storage.
-        pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
-            self.do_try_load(ctx, LoadMode::Local).await
-        }
-
-        /// The 'load' in this function is either a local load or a normal attachment,
-        pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
-            // If we have nothing in remote storage, must use load_local instead of attach: attach
-            // will error out if there are no timelines.
-            //
-            // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
-            // this weird state of a Tenant which exists but doesn't have any timelines.
-            let mode = match self.remote_empty() {
-                true => LoadMode::Local,
-                false => LoadMode::Remote,
-            };
-
-            self.do_try_load(ctx, mode).await
-        }
-
        pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
            self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
        }
@@ -4124,20 +4177,33 @@ pub(crate) mod harness {
            records: Vec<(Lsn, NeonWalRecord)>,
            _pg_version: u32,
        ) -> anyhow::Result<Bytes> {
-            let s = format!(
-                "redo for {} to get to {}, with {} and {} records",
-                key,
-                lsn,
-                if base_img.is_some() {
-                    "base image"
-                } else {
-                    "no base image"
-                },
-                records.len()
-            );
-            println!("{s}");
+            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
+            if records_neon {
+                // For Neon wal records, we can decode without spawning postgres, so do so.
+                let base_img = base_img.expect("Neon WAL redo requires base image").1;
+                let mut page = BytesMut::new();
+                page.extend_from_slice(&base_img);
+                for (_record_lsn, record) in records {
+                    apply_neon::apply_in_neon(&record, key, &mut page)?;
+                }
+                Ok(page.freeze())
+            } else {
+                // We never spawn a postgres walredo process in unit tests: just log what we might have done.
+                let s = format!(
+                    "redo for {} to get to {}, with {} and {} records",
+                    key,
+                    lsn,
+                    if base_img.is_some() {
+                        "base image"
+                    } else {
+                        "no base image"
+                    },
+                    records.len()
+                );
+                println!("{s}");

-            Ok(TEST_IMG(&s))
+                Ok(test_img(&s))
+            }
        }
    }
 }
@@ -4172,7 +4238,7 @@ mod tests {
            .put(
                *TEST_KEY,
                Lsn(0x10),
-                &Value::Image(TEST_IMG("foo at 0x10")),
+                &Value::Image(test_img("foo at 0x10")),
                &ctx,
            )
            .await?;
@@ -4184,7 +4250,7 @@ mod tests {
            .put(
                *TEST_KEY,
                Lsn(0x20),
-                &Value::Image(TEST_IMG("foo at 0x20")),
+                &Value::Image(test_img("foo at 0x20")),
                &ctx,
            )
            .await?;
@@ -4193,15 +4259,15 @@ mod tests {

        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
-            TEST_IMG("foo at 0x20")
+            test_img("foo at 0x20")
        );

        Ok(())
@@ -4310,7 +4376,6 @@ mod tests {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
-        #[allow(non_snake_case)]
        {
            let writer = tline.writer().await;
            // Create a relation on the timeline
@@ -4318,7 +4383,7 @@ mod tests {
                .put(
                    *TEST_KEY,
                    lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                    ctx,
                )
                .await?;
@@ -4328,7 +4393,7 @@ mod tests {
                .put(
                    *TEST_KEY,
                    lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                    ctx,
                )
                .await?;
@@ -4342,7 +4407,7 @@ mod tests {
                .put(
                    *TEST_KEY,
                    lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                    ctx,
                )
                .await?;
@@ -4352,7 +4417,7 @@ mod tests {
                .put(
                    *TEST_KEY,
                    lsn,
-                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    &Value::Image(test_img(&format!("foo at {}", lsn))),
                    ctx,
                )
                .await?;
@@ -4507,7 +4572,7 @@ mod tests {
        // Broken, as long as you don't need to access data from the parent.
        assert_eq!(
            newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x70)))
+            test_img(&format!("foo at {}", Lsn(0x70)))
        );

        // This needs to traverse to the parent, and fails.
@@ -4584,7 +4649,7 @@ mod tests {
        // Check that the data is still accessible on the branch.
        assert_eq!(
            newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
+            test_img(&format!("foo at {}", Lsn(0x40)))
        );

        Ok(())
@@ -4759,7 +4824,7 @@ mod tests {
            .put(
                *TEST_KEY,
                Lsn(0x10),
-                &Value::Image(TEST_IMG("foo at 0x10")),
+                &Value::Image(test_img("foo at 0x10")),
                &ctx,
            )
            .await?;
@@ -4776,7 +4841,7 @@ mod tests {
            .put(
                *TEST_KEY,
                Lsn(0x20),
-                &Value::Image(TEST_IMG("foo at 0x20")),
+                &Value::Image(test_img("foo at 0x20")),
                &ctx,
            )
            .await?;
@@ -4793,7 +4858,7 @@ mod tests {
            .put(
                *TEST_KEY,
                Lsn(0x30),
-                &Value::Image(TEST_IMG("foo at 0x30")),
+                &Value::Image(test_img("foo at 0x30")),
                &ctx,
            )
            .await?;
@@ -4810,7 +4875,7 @@ mod tests {
            .put(
                *TEST_KEY,
                Lsn(0x40),
-                &Value::Image(TEST_IMG("foo at 0x40")),
+                &Value::Image(test_img("foo at 0x40")),
                &ctx,
            )
            .await?;
@@ -4824,23 +4889,23 @@ mod tests {

        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
-            TEST_IMG("foo at 0x10")
+            test_img("foo at 0x10")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
-            TEST_IMG("foo at 0x20")
+            test_img("foo at 0x20")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
-            TEST_IMG("foo at 0x30")
+            test_img("foo at 0x30")
        );
        assert_eq!(
            tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
-            TEST_IMG("foo at 0x40")
+            test_img("foo at 0x40")
        );

        Ok(())
@@ -4867,7 +4932,7 @@ mod tests {
                    .put(
                        test_key,
                        lsn,
-                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                        ctx,
                    )
                    .await?;
@@ -5038,7 +5103,7 @@ mod tests {
                .put(
                    test_key,
                    lsn,
-                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                    &ctx,
                )
                .await?;
@@ -5059,7 +5124,7 @@ mod tests {
                    .put(
                        test_key,
                        lsn,
-                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                        &ctx,
                    )
                    .await?;
@@ -5073,7 +5138,7 @@ mod tests {
                test_key.field6 = blknum as u32;
                assert_eq!(
                    tline.get(test_key, lsn, &ctx).await?,
-                    TEST_IMG(&format!("{} at {}", blknum, last_lsn))
+                    test_img(&format!("{} at {}", blknum, last_lsn))
                );
            }

@@ -5127,7 +5192,7 @@ mod tests {
                .put(
                    test_key,
                    lsn,
-                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                    &ctx,
                )
                .await?;
@@ -5156,7 +5221,7 @@ mod tests {
                    .put(
                        test_key,
                        lsn,
-                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
                        &ctx,
                    )
                    .await?;
@@ -5171,7 +5236,7 @@ mod tests {
                test_key.field6 = blknum as u32;
                assert_eq!(
                    tline.get(test_key, lsn, &ctx).await?,
-                    TEST_IMG(&format!("{} at {}", blknum, last_lsn))
+                    test_img(&format!("{} at {}", blknum, last_lsn))
                );
            }

@@ -5233,7 +5298,7 @@ mod tests {
                    .put(
                        test_key,
                        lsn,
-                        &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
+                        &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
                        &ctx,
                    )
                    .await?;
@@ -5255,7 +5320,7 @@ mod tests {
                test_key.field6 = blknum as u32;
                assert_eq!(
                    tline.get(test_key, *lsn, &ctx).await?,
-                    TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
+                    test_img(&format!("{idx} {blknum} at {lsn}"))
                );
            }
        }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -11,6 +11,9 @@
 //! len <  128: 0XXXXXXX
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
+use bytes::{BufMut, BytesMut};
+use tokio_epoll_uring::{BoundedBuf, Slice};
+
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
@@ -100,6 +103,8 @@ pub struct BlobWriter<const BUFFERED: bool> {
    offset: u64,
    /// A buffer to save on write calls, only used if BUFFERED=true
    buf: Vec<u8>,
+    /// We do tiny writes for the length headers; they need to be in an owned buffer;
+    io_buf: Option<BytesMut>,
 }

 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
@@ -108,6 +113,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
            inner,
            offset: start_offset,
            buf: Vec::with_capacity(Self::CAPACITY),
+            io_buf: Some(BytesMut::new()),
        }
    }

@@ -117,21 +123,31 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

    const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };

-    #[inline(always)]
    /// Writes the given buffer directly to the underlying `VirtualFile`.
    /// You need to make sure that the internal buffer is empty, otherwise
    /// data will be written in wrong order.
-    async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
-        self.inner.write_all(src_buf).await?;
-        self.offset += src_buf.len() as u64;
-        Ok(())
+    #[inline(always)]
+    async fn write_all_unbuffered<B: BoundedBuf>(
+        &mut self,
+        src_buf: B,
+    ) -> (B::Buf, Result<(), Error>) {
+        let (src_buf, res) = self.inner.write_all(src_buf).await;
+        let nbytes = match res {
+            Ok(nbytes) => nbytes,
+            Err(e) => return (src_buf, Err(e)),
+        };
+        self.offset += nbytes as u64;
+        (src_buf, Ok(()))
    }

    #[inline(always)]
    /// Flushes the internal buffer to the underlying `VirtualFile`.
    pub async fn flush_buffer(&mut self) -> Result<(), Error> {
-        self.inner.write_all(&self.buf).await?;
-        self.buf.clear();
+        let buf = std::mem::take(&mut self.buf);
+        let (mut buf, res) = self.inner.write_all(buf).await;
+        res?;
+        buf.clear();
+        self.buf = buf;
        Ok(())
    }

@@ -146,62 +162,91 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    }

    /// Internal, possibly buffered, write function
-    async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
+    async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
        if !BUFFERED {
            assert!(self.buf.is_empty());
-            self.write_all_unbuffered(src_buf).await?;
-            return Ok(());
+            return self.write_all_unbuffered(src_buf).await;
        }
        let remaining = Self::CAPACITY - self.buf.len();
+        let src_buf_len = src_buf.bytes_init();
+        if src_buf_len == 0 {
+            return (Slice::into_inner(src_buf.slice_full()), Ok(()));
+        }
+        let mut src_buf = src_buf.slice(0..src_buf_len);
        // First try to copy as much as we can into the buffer
        if remaining > 0 {
-            let copied = self.write_into_buffer(src_buf);
-            src_buf = &src_buf[copied..];
+            let copied = self.write_into_buffer(&src_buf);
+            src_buf = src_buf.slice(copied..);
        }
        // Then, if the buffer is full, flush it out
        if self.buf.len() == Self::CAPACITY {
-            self.flush_buffer().await?;
+            if let Err(e) = self.flush_buffer().await {
+                return (Slice::into_inner(src_buf), Err(e));
+            }
        }
        // Finally, write the tail of src_buf:
        // If it wholly fits into the buffer without
        // completely filling it, then put it there.
        // If not, write it out directly.
-        if !src_buf.is_empty() {
+        let src_buf = if !src_buf.is_empty() {
            assert_eq!(self.buf.len(), 0);
            if src_buf.len() < Self::CAPACITY {
-                let copied = self.write_into_buffer(src_buf);
+                let copied = self.write_into_buffer(&src_buf);
                // We just verified above that src_buf fits into our internal buffer.
                assert_eq!(copied, src_buf.len());
+                Slice::into_inner(src_buf)
            } else {
-                self.write_all_unbuffered(src_buf).await?;
+                let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
+                if let Err(e) = res {
+                    return (src_buf, Err(e));
+                }
+                src_buf
            }
-        }
-        Ok(())
+        } else {
+            Slice::into_inner(src_buf)
+        };
+        (src_buf, Ok(()))
    }

    /// Write a blob of data. Returns the offset that it was written to,
    /// which can be used to retrieve the data later.
-    pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
+    pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;

-        if srcbuf.len() < 128 {
-            // Short blob. Write a 1-byte length header
-            let len_buf = srcbuf.len() as u8;
-            self.write_all(&[len_buf]).await?;
-        } else {
-            // Write a 4-byte length header
-            if srcbuf.len() > 0x7fff_ffff {
-                return Err(Error::new(
-                    ErrorKind::Other,
-                    format!("blob too large ({} bytes)", srcbuf.len()),
-                ));
+        let len = srcbuf.bytes_init();
+
+        let mut io_buf = self.io_buf.take().expect("we always put it back below");
+        io_buf.clear();
+        let (io_buf, hdr_res) = async {
+            if len < 128 {
+                // Short blob. Write a 1-byte length header
+                io_buf.put_u8(len as u8);
+                self.write_all(io_buf).await
+            } else {
+                // Write a 4-byte length header
+                if len > 0x7fff_ffff {
+                    return (
+                        io_buf,
+                        Err(Error::new(
+                            ErrorKind::Other,
+                            format!("blob too large ({} bytes)", len),
+                        )),
+                    );
+                }
+                let mut len_buf = (len as u32).to_be_bytes();
+                len_buf[0] |= 0x80;
+                io_buf.extend_from_slice(&len_buf[..]);
+                self.write_all(io_buf).await
            }
-            let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
-            len_buf[0] |= 0x80;
-            self.write_all(&len_buf).await?;
        }
-        self.write_all(srcbuf).await?;
-        Ok(offset)
+        .await;
+        self.io_buf = Some(io_buf);
+        match hdr_res {
+            Ok(_) => (),
+            Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
+        }
+        let (srcbuf, res) = self.write_all(srcbuf).await;
+        (srcbuf, res.map(|_| offset))
    }
 }

@@ -248,12 +293,14 @@ mod tests {
            let file = VirtualFile::create(pathbuf.as_path()).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
-                let offs = wtr.write_blob(blob).await?;
+                let (_, res) = wtr.write_blob(blob.clone()).await;
+                let offs = res?;
                offsets.push(offs);
            }
            // Write out one page worth of zeros so that we can
            // read again with read_blk
-            let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
+            let offs = res?;
            println!("Writing final blob at offs={offs}");
            wtr.flush_buffer().await?;
        }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,10 +3,10 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::{models::TenantState, shard::TenantShardId};
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, Instrument, Span};
+use tracing::{error, instrument, Instrument};

 use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

@@ -84,17 +84,17 @@ async fn create_remote_delete_mark(
            let data = bytes::Bytes::from_static(data);
            let stream = futures::stream::once(futures::future::ready(Ok(data)));
            remote_storage
-                .upload(stream, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None, cancel)
                .await
        },
-        |_e| false,
+        TimeoutOrCancel::caused_by_cancel,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
        cancel,
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
    .context("mark_upload")?;

@@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark(
    if let Some(remote_storage) = remote_storage {
        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
        backoff::retry(
-            || async { remote_storage.delete(&path).await },
-            |_e| false,
+            || async { remote_storage.delete(&path, cancel).await },
+            TimeoutOrCancel::caused_by_cancel,
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
            cancel,
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)
        .context("remove_tenant_remote_delete_mark")?;
    }
@@ -496,11 +496,7 @@ impl DeleteTenantFlow {
                };
                Ok(())
            }
-            .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
-                span.follows_from(Span::current());
-                span
-            }),
+            .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
        );
    }

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -36,7 +36,6 @@ use crate::{
 pub const VALUE_SZ: usize = 5;
 pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;

-#[allow(dead_code)]
 pub const PAGE_SZ: usize = 8192;

 #[derive(Clone, Copy, Debug)]
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,6 +6,7 @@ use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::{self, VirtualFile};
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
    /// An ephemeral file is append-only.
    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
    /// The other pages, which can no longer be modified, are accessed through the page cache.
-    mutable_tail: [u8; PAGE_SZ],
+    ///
+    /// None <=> IO is ongoing.
+    /// Size is fixed to PAGE_SZ at creation time and must not be changed.
+    mutable_tail: Option<BytesMut>,
 }

 impl EphemeralFile {
@@ -60,7 +64,7 @@ impl EphemeralFile {
            _timeline_id: timeline_id,
            file,
            len: 0,
-            mutable_tail: [0u8; PAGE_SZ],
+            mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
        })
    }

@@ -103,7 +107,13 @@ impl EphemeralFile {
            };
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
-            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+            Ok(BlockLease::EphemeralFileMutableTail(
+                self.mutable_tail
+                    .as_deref()
+                    .expect("we're not doing IO, it must be Some()")
+                    .try_into()
+                    .expect("we ensure that it's always PAGE_SZ"),
+            ))
        }
    }

@@ -135,21 +145,27 @@ impl EphemeralFile {
            ) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
-                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
+                    let dst_remaining = &mut self
+                        .ephemeral_file
+                        .mutable_tail
+                        .as_deref_mut()
+                        .expect("IO is not yet ongoing")[self.off..];
                    let n = min(dst_remaining.len(), src_remaining.len());
                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
                    self.off += n;
                    src_remaining = &src_remaining[n..];
                    if self.off == PAGE_SZ {
-                        match self
+                        let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
+                            .expect("IO is not yet ongoing");
+                        let (mutable_tail, res) = self
                            .ephemeral_file
                            .file
-                            .write_all_at(
-                                &self.ephemeral_file.mutable_tail,
-                                self.blknum as u64 * PAGE_SZ as u64,
-                            )
-                            .await
-                        {
+                            .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
+                            .await;
+                        // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
+                        // I.e., the IO isn't retryable if we panic.
+                        self.ephemeral_file.mutable_tail = Some(mutable_tail);
+                        match res {
                            Ok(_) => {
                                // Pre-warm the page cache with what we just wrote.
                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
@@ -169,7 +185,12 @@ impl EphemeralFile {
                                    Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
+                                        buf.copy_from_slice(
+                                            self.ephemeral_file
+                                                .mutable_tail
+                                                .as_deref()
+                                                .expect("IO is not ongoing"),
+                                        );
                                        let _ = write_guard.mark_valid();
                                        // pre-warm successful
                                    }
@@ -181,7 +202,11 @@ impl EphemeralFile {
                                // Zero the buffer for re-use.
                                // Zeroing is critical for correcntess because the write_blob code below
                                // and similarly read_blk expect zeroed pages.
-                                self.ephemeral_file.mutable_tail.fill(0);
+                                self.ephemeral_file
+                                    .mutable_tail
+                                    .as_deref_mut()
+                                    .expect("IO is not ongoing")
+                                    .fill(0);
                                // This block is done, move to next one.
                                self.blknum += 1;
                                self.off = 0;
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -279,7 +279,7 @@ pub async fn save_metadata(
    let path = conf.metadata_path(tenant_shard_id, timeline_id);
    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
+    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
        .await
        .context("write metadata")?;
    Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,6 +2,8 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use futures::stream::StreamExt;
+use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
@@ -22,7 +24,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
-use utils::crashsafe;
+use utils::{completion, crashsafe};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -644,8 +646,6 @@ pub(crate) async fn shutdown_all_tenants() {
 }

 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
-    use utils::completion;
-
    let mut join_set = JoinSet::new();

    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
@@ -1200,7 +1200,7 @@ impl TenantManager {
        &self,
        tenant_shard_id: TenantShardId,
        drop_cache: bool,
-        ctx: RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        let Some(old_slot) = slot_guard.get_old_value() else {
@@ -1253,7 +1253,7 @@ impl TenantManager {
            None,
            self.tenants,
            SpawnMode::Normal,
-            &ctx,
+            ctx,
        )?;

        slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1375,6 +1375,309 @@ impl TenantManager {
        slot_guard.revert();
        result
    }
+
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
+    pub(crate) async fn shard_split(
+        &self,
+        tenant_shard_id: TenantShardId,
+        new_shard_count: ShardCount,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<TenantShardId>> {
+        let tenant = get_tenant(tenant_shard_id, true)?;
+
+        // Plan: identify what the new child shards will be
+        let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
+        if new_shard_count <= ShardCount(effective_old_shard_count) {
+            anyhow::bail!("Requested shard count is not an increase");
+        }
+        let expansion_factor = new_shard_count.0 / effective_old_shard_count;
+        if !expansion_factor.is_power_of_two() {
+            anyhow::bail!("Requested split is not a power of two");
+        }
+
+        let parent_shard_identity = tenant.shard_identity;
+        let parent_tenant_conf = tenant.get_tenant_conf();
+        let parent_generation = tenant.generation;
+
+        let child_shards = tenant_shard_id.split(new_shard_count);
+        tracing::info!(
+            "Shard {} splits into: {}",
+            tenant_shard_id.to_index(),
+            child_shards
+                .iter()
+                .map(|id| format!("{}", id.to_index()))
+                .join(",")
+        );
+
+        // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
+        if let Err(e) = tenant.split_prepare(&child_shards).await {
+            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
+            // have been left in a partially-shut-down state.
+            tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
+            self.reset_tenant(tenant_shard_id, false, ctx).await?;
+            return Err(e);
+        }
+
+        self.resources.deletion_queue_client.flush_advisory();
+
+        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
+        drop(tenant);
+        let mut parent_slot_guard =
+            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let parent = match parent_slot_guard.get_old_value() {
+            Some(TenantSlot::Attached(t)) => t,
+            Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
+            Some(TenantSlot::InProgress(_)) => {
+                // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
+                // it would return an error.
+                unreachable!()
+            }
+            None => {
+                // We don't actually need the parent shard to still be attached to do our work, but it's
+                // a weird enough situation that the caller probably didn't want us to continue working
+                // if they had detached the tenant they requested the split on.
+                anyhow::bail!("Detached parent shard in the middle of split!")
+            }
+        };
+
+        // Optimization: hardlink layers from the parent into the children, so that they don't have to
+        // re-download & duplicate the data referenced in their initial IndexPart
+        self.shard_split_hardlink(parent, child_shards.clone())
+            .await?;
+
+        // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
+        // child shards to reach this point.
+        let mut target_lsns = HashMap::new();
+        for timeline in parent.timelines.lock().unwrap().clone().values() {
+            target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
+        }
+
+        // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
+        // and could slow down the children trying to catch up.
+
+        // Phase 3: Spawn the child shards
+        for child_shard in &child_shards {
+            let mut child_shard_identity = parent_shard_identity;
+            child_shard_identity.count = child_shard.shard_count;
+            child_shard_identity.number = child_shard.shard_number;
+
+            let child_location_conf = LocationConf {
+                mode: LocationMode::Attached(AttachedLocationConfig {
+                    generation: parent_generation,
+                    attach_mode: AttachmentMode::Single,
+                }),
+                shard: child_shard_identity,
+                tenant_conf: parent_tenant_conf,
+            };
+
+            self.upsert_location(
+                *child_shard,
+                child_location_conf,
+                None,
+                SpawnMode::Normal,
+                ctx,
+            )
+            .await?;
+        }
+
+        // Phase 4: wait for child chards WAL ingest to catch up to target LSN
+        for child_shard_id in &child_shards {
+            let child_shard_id = *child_shard_id;
+            let child_shard = {
+                let locked = TENANTS.read().unwrap();
+                let peek_slot =
+                    tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
+                peek_slot.and_then(|s| s.get_attached()).cloned()
+            };
+            if let Some(t) = child_shard {
+                let timelines = t.timelines.lock().unwrap().clone();
+                for timeline in timelines.values() {
+                    let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
+                        continue;
+                    };
+
+                    tracing::info!(
+                        "Waiting for child shard {}/{} to reach target lsn {}...",
+                        child_shard_id,
+                        timeline.timeline_id,
+                        target_lsn
+                    );
+                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+                        // Failure here might mean shutdown, in any case this part is an optimization
+                        // and we shouldn't hold up the split operation.
+                        tracing::warn!(
+                            "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
+                            timeline.timeline_id
+                        );
+                    } else {
+                        tracing::info!(
+                            "Child shard {}/{} reached target lsn {}",
+                            child_shard_id,
+                            timeline.timeline_id,
+                            target_lsn
+                        );
+                    }
+                }
+            }
+        }
+
+        // Phase 5: Shut down the parent shard, and erase it from disk
+        let (_guard, progress) = completion::channel();
+        match parent.shutdown(progress, false).await {
+            Ok(()) => {}
+            Err(other) => {
+                other.wait().await;
+            }
+        }
+        let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
+        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
+            .await
+            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            None,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+
+        parent_slot_guard.drop_old_value()?;
+
+        // Phase 6: Release the InProgress on the parent shard
+        drop(parent_slot_guard);
+
+        Ok(child_shards)
+    }
+
+    /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
+    /// to avoid the children downloading them again.
+    ///
+    /// For each resident layer in the parent shard, we will hard link it into all of the child shards.
+    async fn shard_split_hardlink(
+        &self,
+        parent_shard: &Tenant,
+        child_shards: Vec<TenantShardId>,
+    ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
+        let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
+        let (parent_timelines, parent_layers) = {
+            let mut parent_layers = Vec::new();
+            let timelines = parent_shard.timelines.lock().unwrap().clone();
+            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
+            for timeline in timelines.values() {
+                let timeline_layers = timeline
+                    .layers
+                    .read()
+                    .await
+                    .resident_layers()
+                    .collect::<Vec<_>>()
+                    .await;
+                for layer in timeline_layers {
+                    let relative_path = layer
+                        .local_path()
+                        .strip_prefix(&parent_path)
+                        .context("Removing prefix from parent layer path")?;
+                    parent_layers.push(relative_path.to_owned());
+                }
+            }
+            debug_assert!(
+                !parent_layers.is_empty(),
+                "shutdown cannot empty the layermap"
+            );
+            (parent_timelines, parent_layers)
+        };
+
+        let mut child_prefixes = Vec::new();
+        let mut create_dirs = Vec::new();
+
+        for child in child_shards {
+            let child_prefix = self.conf.tenant_path(&child);
+            create_dirs.push(child_prefix.clone());
+            create_dirs.extend(
+                parent_timelines
+                    .iter()
+                    .map(|t| self.conf.timeline_path(&child, t)),
+            );
+
+            child_prefixes.push(child_prefix);
+        }
+
+        // Since we will do a large number of small filesystem metadata operations, batch them into
+        // spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
+        let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
+            for dir in &create_dirs {
+                if let Err(e) = std::fs::create_dir_all(dir) {
+                    // Ignore AlreadyExists errors, drop out on all other errors
+                    match e.kind() {
+                        std::io::ErrorKind::AlreadyExists => {}
+                        _ => {
+                            return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
+                        }
+                    }
+                }
+            }
+
+            for child_prefix in child_prefixes {
+                for relative_layer in &parent_layers {
+                    let parent_path = parent_path.join(relative_layer);
+                    let child_path = child_prefix.join(relative_layer);
+                    if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
+                        match e.kind() {
+                            std::io::ErrorKind::AlreadyExists => {}
+                            std::io::ErrorKind::NotFound => {
+                                tracing::info!(
+                                    "Layer {} not found during hard-linking, evicted during split?",
+                                    relative_layer
+                                );
+                            }
+                            _ => {
+                                return Err(anyhow::anyhow!(e).context(format!(
+                                    "Hard linking {relative_layer} into {child_prefix}"
+                                )))
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Durability is not required for correctness, but if we crashed during split and
+            // then came restarted with empty timeline dirs, it would be very inefficient to
+            // re-populate from remote storage.
+            for dir in create_dirs {
+                if let Err(e) = crashsafe::fsync(&dir) {
+                    // Something removed a newly created timeline dir out from underneath us?  Extremely
+                    // unexpected, but not worth panic'ing over as this whole function is just an
+                    // optimization.
+                    tracing::warn!("Failed to fsync directory {dir}: {e}")
+                }
+            }
+
+            Ok(parent_layers.len())
+        });
+
+        match jh.await {
+            Ok(Ok(layer_count)) => {
+                tracing::info!(count = layer_count, "Hard linked layers into child shards");
+            }
+            Ok(Err(e)) => {
+                // This is an optimization, so we tolerate failure.
+                tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
+            }
+            Err(e) => {
+                // This is something totally unexpected like a panic, so bail out.
+                anyhow::bail!("Error joining hard linking task: {e}");
+            }
+        }
+
+        Ok(())
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2209,8 +2512,6 @@ async fn remove_tenant_from_memory<V, F>(
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
-    use utils::completion;
-
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
-use std::time::Duration;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
@@ -217,6 +215,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
@@ -325,45 +324,6 @@ pub struct RemoteTimelineClient {
    cancel: CancellationToken,
 }

-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
-///
-/// This is a convenience for the various upload functions.  In future
-/// the anyhow::Error result should be replaced with a more structured type that
-/// enables callers to avoid handling shutdown as an error.
-async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
-where
-    F: std::future::Future<Output = anyhow::Result<()>>,
-{
-    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(())) => Ok(()),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
-        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
-    }
-}
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
-async fn download_cancellable<F, R>(
-    cancel: &CancellationToken,
-    future: F,
-) -> Result<R, DownloadError>
-where
-    F: std::future::Future<Output = Result<R, DownloadError>>,
-{
-    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(r)) => Ok(r),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => {
-            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
-        }
-        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
-    }
-}
-
 impl RemoteTimelineClient {
    ///
    /// Create a remote storage client for given timeline
@@ -506,7 +466,7 @@ impl RemoteTimelineClient {
    /// Download index file
    pub async fn download_index_file(
        &self,
-        cancel: CancellationToken,
+        cancel: &CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
@@ -1049,7 +1009,7 @@ impl RemoteTimelineClient {
            &self.cancel,
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)?;

        // all good, disarm the guard and mark as success
@@ -1081,14 +1041,14 @@ impl RemoteTimelineClient {
                upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
                    .await
            },
-            |_e| false,
+            TimeoutOrCancel::caused_by_cancel,
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "preserve_initdb_tar_zst",
            &cancel.clone(),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancellled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)
        .context("backing up initdb archive")?;
        Ok(())
@@ -1147,22 +1107,17 @@ impl RemoteTimelineClient {

        let cancel = shutdown_token();

-        let remaining = backoff::retry(
+        let remaining = download_retry(
            || async {
                self.storage_impl
-                    .list_files(Some(&timeline_storage_path))
+                    .list_files(Some(&timeline_storage_path), None, &cancel)
                    .await
            },
-            |_e| false,
-            FAILED_DOWNLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "list_prefixes",
+            "list remaining files",
            &cancel,
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled!"))
-        .and_then(|x| x)
-        .context("list prefixes")?;
+        .context("list files remaining files")?;

        // We will delete the current index_part object last, since it acts as a deletion
        // marker via its deleted_at attribute
@@ -1351,6 +1306,7 @@ impl RemoteTimelineClient {
    /// queue.
    ///
    async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
+        let cancel = shutdown_token();
        // Loop to retry until it completes.
        loop {
            // If we're requested to shut down, close up shop and exit.
@@ -1362,7 +1318,7 @@ impl RemoteTimelineClient {
            // the Future, but we're not 100% sure if the remote storage library
            // is cancellation safe, so we don't dare to do that. Hopefully, the
            // upload finishes or times out soon enough.
-            if task_mgr::is_shutdown_requested() {
+            if cancel.is_cancelled() {
                info!("upload task cancelled by shutdown request");
                match self.stop() {
                    Ok(()) => {}
@@ -1448,6 +1404,10 @@ impl RemoteTimelineClient {
                Ok(()) => {
                    break;
                }
+                Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => {
+                    // loop around to do the proper stopping
+                    continue;
+                }
                Err(e) => {
                    let retries = task.retries.fetch_add(1, Ordering::SeqCst);

@@ -1473,7 +1433,7 @@ impl RemoteTimelineClient {
                        retries,
                        DEFAULT_BASE_BACKOFF_SECONDS,
                        DEFAULT_MAX_BACKOFF_SECONDS,
-                        &shutdown_token(),
+                        &cancel,
                    )
                    .await;
                }
@@ -1703,23 +1663,6 @@ impl RemoteTimelineClient {
            }
        }
    }
-
-    pub(crate) fn get_layers_metadata(
-        &self,
-        layers: Vec<LayerFileName>,
-    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
-        let q = self.upload_queue.lock().unwrap();
-        let q = match &*q {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", q.as_str())
-            }
-            UploadQueue::Initialized(inner) => inner,
-        };
-
-        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
-
-        Ok(decorated.collect())
-    }
 }

 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1990,7 +1933,7 @@ mod tests {

        // Download back the index.json, and check that the list of files is correct
        let initial_index_part = match client
-            .download_index_file(CancellationToken::new())
+            .download_index_file(&CancellationToken::new())
            .await
            .unwrap()
        {
@@ -2084,7 +2027,7 @@ mod tests {

        // Download back the index.json, and check that the list of files is correct
        let index_part = match client
-            .download_index_file(CancellationToken::new())
+            .download_index_file(&CancellationToken::new())
            .await
            .unwrap()
        {
@@ -2286,7 +2229,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file(CancellationToken::new())
+            .download_index_file(&CancellationToken::new())
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::shard::TenantShardId;
 use tokio::fs::{self, File, OpenOptions};
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
+use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
-use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::{
-    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
-};
+use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
 use crate::virtual_file::on_fatal_io_error;
@@ -83,15 +81,13 @@ pub async fn download_layer_file<'a>(
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;

-            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
-            // file: the write to local file doesn't start until after the request header is returned
-            // and we start draining the body stream below
-            let download = download_cancellable(cancel, storage.download(&remote_path))
+            let download = storage
+                .download(&remote_path, cancel)
                .await
                .with_context(|| {
                    format!(
-                    "open a download stream for layer with remote storage path '{remote_path:?}'"
-                )
+                        "open a download stream for layer with remote storage path '{remote_path:?}'"
+                    )
                })
                .map_err(DownloadError::Other)?;

@@ -100,43 +96,26 @@ pub async fn download_layer_file<'a>(

            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);

-            // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
-            // and we will unlink the temporary file if there is an error.  This unlink is important because we
-            // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
-            // we will imminiently try and write to again.
-            let bytes_amount: u64 = match timeout_cancellable(
-                DOWNLOAD_TIMEOUT,
-                cancel,
-                tokio::io::copy_buf(&mut reader, &mut destination_file),
-            )
-            .await
-            .with_context(|| {
-                format!(
+            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
+                .await
+                .with_context(|| format!(
                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?
-            {
-                Ok(b) => Ok(b),
+                ))
+                .map_err(DownloadError::Other);
+
+            match bytes_amount {
+                Ok(bytes_amount) => {
+                    let destination_file = destination_file.into_inner();
+                    Ok((destination_file, bytes_amount))
+                }
                Err(e) => {
-                    // Remove incomplete files: on restart Timeline would do this anyway, but we must
-                    // do it here for the retry case.
                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
                    }
+
                    Err(e)
                }
            }
-            .with_context(|| {
-                format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?;
-
-            let destination_file = destination_file.into_inner();
-
-            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
        cancel,
@@ -216,16 +195,17 @@ pub async fn list_remote_timelines(
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

-    let cancel_inner = cancel.clone();
    let listing = download_retry_forever(
        || {
-            download_cancellable(
-                &cancel_inner,
-                storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+            storage.list(
+                Some(&remote_path),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
            )
        },
        &format!("list timelines for {tenant_shard_id}"),
-        cancel,
+        &cancel,
    )
    .await?;

@@ -258,29 +238,25 @@ async fn do_download_index_part(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    index_generation: Generation,
-    cancel: CancellationToken,
+    cancel: &CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-    use futures::stream::StreamExt;
-
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

-    let cancel_inner = cancel.clone();
    let index_part_bytes = download_retry_forever(
        || async {
-            // Cancellation: if is safe to cancel this future because we're just downloading into
-            // a memory buffer, not touching local disk.
-            let index_part_download =
-                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
+            let download = storage.download(&remote_path, cancel).await?;

-            let mut index_part_bytes = Vec::new();
-            let mut stream = std::pin::pin!(index_part_download.download_stream);
-            while let Some(chunk) = stream.next().await {
-                let chunk = chunk
-                    .with_context(|| format!("download index part at {remote_path:?}"))
-                    .map_err(DownloadError::Other)?;
-                index_part_bytes.extend_from_slice(&chunk[..]);
-            }
-            Ok(index_part_bytes)
+            let mut bytes = Vec::new();
+
+            let stream = download.download_stream;
+            let mut stream = StreamReader::new(stream);
+
+            tokio::io::copy_buf(&mut stream, &mut bytes)
+                .await
+                .with_context(|| format!("download index part at {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            Ok(bytes)
        },
        &format!("download {remote_path:?}"),
        cancel,
@@ -288,7 +264,7 @@ async fn do_download_index_part(
    .await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| format!("download index part file at {remote_path:?}"))
+        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

    Ok(index_part)
@@ -305,7 +281,7 @@ pub(super) async fn download_index_part(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    my_generation: Generation,
-    cancel: CancellationToken,
+    cancel: &CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -325,14 +301,8 @@ pub(super) async fn download_index_part(
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
-        storage,
-        tenant_shard_id,
-        timeline_id,
-        my_generation,
-        cancel.clone(),
-    )
-    .await;
+    let res =
+        do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -357,7 +327,7 @@ pub(super) async fn download_index_part(
        tenant_shard_id,
        timeline_id,
        my_generation.previous(),
-        cancel.clone(),
+        cancel,
    )
    .await;
    match res {
@@ -379,18 +349,13 @@ pub(super) async fn download_index_part(
    // objects, and select the highest one with a generation <= my_generation.  Constructing the prefix is equivalent
    // to constructing a full index path with no generation, because the generation is a suffix.
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
-    let indices = backoff::retry(
-        || async { storage.list_files(Some(&index_prefix)).await },
-        |_| false,
-        FAILED_DOWNLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "listing index_part files",
-        &cancel,
+
+    let indices = download_retry(
+        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
+        "list index_part files",
+        cancel,
    )
-    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-    .and_then(|x| x)
-    .map_err(DownloadError::Other)?;
+    .await?;

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
@@ -447,8 +412,6 @@ pub(crate) async fn download_initdb_tar_zst(
        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

-    let cancel_inner = cancel.clone();
-
    let file = download_retry(
        || async {
            let file = OpenOptions::new()
@@ -461,13 +424,10 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
-                .await
-            {
+            let download = match storage.download(&remote_path, cancel).await {
                Ok(dl) => dl,
                Err(DownloadError::NotFound) => {
-                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
-                        .await?
+                    storage.download(&remote_preserved_path, cancel).await?
                }
                Err(other) => Err(other)?,
            };
@@ -477,6 +437,7 @@ pub(crate) async fn download_initdb_tar_zst(
            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
            // not without thinking carefully about how to recover safely from cancelling a write to
            // local storage (e.g. by writing into a temp file as we do in download_layer)
+            // FIXME: flip the weird error wrapping
            tokio::io::copy_buf(&mut download, &mut writer)
                .await
                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
@@ -516,7 +477,7 @@ pub(crate) async fn download_initdb_tar_zst(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(
+pub(super) async fn download_retry<T, O, F>(
    op: O,
    description: &str,
    cancel: &CancellationToken,
@@ -527,7 +488,7 @@ where
 {
    backoff::retry(
        op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        DownloadError::is_permanent,
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
@@ -541,7 +502,7 @@ where
 async fn download_retry_forever<T, O, F>(
    op: O,
    description: &str,
-    cancel: CancellationToken,
+    cancel: &CancellationToken,
 ) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
@@ -549,11 +510,11 @@ where
 {
    backoff::retry(
        op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        DownloadError::is_permanent,
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        u32::MAX,
        description,
-        &cancel,
+        cancel,
    )
    .await
    .ok_or_else(|| DownloadError::Cancelled)
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -16,7 +16,7 @@ use crate::{
    config::PageServerConf,
    tenant::remote_timeline_client::{
        index::IndexPart, remote_index_path, remote_initdb_archive_path,
-        remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
+        remote_initdb_preserved_archive_path, remote_path,
    },
 };
 use remote_storage::{GenericRemoteStorage, TimeTravelError};
@@ -27,7 +27,7 @@ use super::index::LayerFileMetadata;
 use tracing::info;

 /// Serializes and uploads the given index part data to the remote storage.
-pub(super) async fn upload_index_part<'a>(
+pub(crate) async fn upload_index_part<'a>(
    storage: &'a GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
@@ -49,16 +49,15 @@ pub(super) async fn upload_index_part<'a>(
    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(
+    storage
+        .upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
-        ),
-    )
-    .await
-    .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
+            cancel,
+        )
+        .await
+        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }

 /// Attempts to upload given layer files.
@@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>(

    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);

-    upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
+    storage
+        .upload(reader, fs_size, &storage_path, None, cancel)
        .await
-        .with_context(|| format!("upload layer from local path '{source_path}'"))?;
-
-    Ok(())
+        .with_context(|| format!("upload layer from local path '{source_path}'"))
 }

 /// Uploads the given `initdb` data to the remote storage.
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir(
    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);

    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(file, size as usize, &remote_path),
-    )
-    .await
-    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+    storage
+        .upload_storage_object(file, size as usize, &remote_path, cancel)
+        .await
+        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }

 pub(crate) async fn preserve_initdb_archive(
@@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive(
 ) -> anyhow::Result<()> {
    let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
    let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
-    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
+    storage
+        .copy_object(&source_path, &dest_path, cancel)
        .await
        .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
 }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -160,7 +160,7 @@ impl SecondaryTenant {
        &self.tenant_shard_id
    }

-    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
+    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> (DiskUsageEvictionInfo, usize) {
        self.detail.lock().unwrap().get_layers_for_eviction(self)
    }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -146,14 +146,15 @@ impl SecondaryDetail {
        }
    }

+    /// Additionally returns the total number of layers, used for more stable relative access time
+    /// based eviction.
    pub(super) fn get_layers_for_eviction(
        &self,
        parent: &Arc<SecondaryTenant>,
-    ) -> DiskUsageEvictionInfo {
-        let mut result = DiskUsageEvictionInfo {
-            max_layer_size: None,
-            resident_layers: Vec::new(),
-        };
+    ) -> (DiskUsageEvictionInfo, usize) {
+        let mut result = DiskUsageEvictionInfo::default();
+        let mut total_layers = 0;
+
        for (timeline_id, timeline_detail) in &self.timelines {
            result
                .resident_layers
@@ -169,6 +170,10 @@ impl SecondaryDetail {
                        relative_last_activity: finite_f32::FiniteF32::ZERO,
                    }
                }));
+
+            // total might be missing currently downloading layers, but as a lower than actual
+            // value it is good enough approximation.
+            total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len();
        }
        result.max_layer_size = result
            .resident_layers
@@ -183,7 +188,7 @@ impl SecondaryDetail {
            result.resident_layers.len()
        );

-        result
+        (result, total_layers)
    }
 }

@@ -312,9 +317,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            .tenant_manager
            .get_secondary_tenant_shard(*tenant_shard_id);
        let Some(tenant) = tenant else {
-            {
-                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-            }
+            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
        };

        Ok(PendingDownload {
@@ -389,9 +392,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            }

            CompleteDownload {
-                    secondary_state,
-                    completed_at: Instant::now(),
-                }
+                secondary_state,
+                completed_at: Instant::now(),
+            }
        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }
 }
@@ -483,7 +486,7 @@ impl<'a> TenantDownloader<'a> {
        let heatmap_path_bg = heatmap_path.clone();
        tokio::task::spawn_blocking(move || {
            tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
+                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
            })
        })
        .await
@@ -520,24 +523,25 @@ impl<'a> TenantDownloader<'a> {
        tracing::debug!("Downloading heatmap for secondary tenant",);

        let heatmap_path = remote_heatmap_path(tenant_shard_id);
+        let cancel = &self.secondary_state.cancel;

        let heatmap_bytes = backoff::retry(
            || async {
                let download = self
                    .remote_storage
-                    .download(&heatmap_path)
+                    .download(&heatmap_path, cancel)
                    .await
                    .map_err(UpdateError::from)?;
                let mut heatmap_bytes = Vec::new();
                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
+                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
                Ok(heatmap_bytes)
            },
            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "download heatmap",
-            &self.secondary_state.cancel,
+            cancel,
        )
        .await
        .ok_or_else(|| UpdateError::Cancelled)
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -21,18 +21,17 @@ use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
-use remote_storage::GenericRemoteStorage;
+use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};

 use super::{
+    heatmap::HeatMapTenant,
    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
-    CommandRequest,
+    CommandRequest, UploadCommand,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
 use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};

-use super::{heatmap::HeatMapTenant, UploadCommand};
-
 pub(super) async fn heatmap_uploader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
@@ -417,10 +416,10 @@ async fn upload_tenant_heatmap(
        || async {
            let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
            remote_storage
-                .upload_storage_object(bytes, size, &path)
+                .upload_storage_object(bytes, size, &path, cancel)
                .await
        },
-        |_| false,
+        TimeoutOrCancel::caused_by_cancel,
        3,
        u32::MAX,
        "Uploading heatmap",
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -539,6 +539,12 @@ impl LayerAccessStats {
        ret
    }

+    /// Get the latest access timestamp, falling back to latest residence event, further falling
+    /// back to `SystemTime::now` for a usable timestamp for eviction.
+    pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
+        self.latest_activity().unwrap_or_else(SystemTime::now)
+    }
+
    /// Get the latest access timestamp, falling back to latest residence event.
    ///
    /// This function can only return `None` if there has not yet been a call to the
@@ -553,7 +559,7 @@ impl LayerAccessStats {
    /// that that type can only be produced by inserting into the layer map.
    ///
    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
+    fn latest_activity(&self) -> Option<SystemTime> {
        let locked = self.0.lock().unwrap();
        let inner = &locked.for_eviction_policy;
        match inner.last_accesses.recent() {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -422,27 +422,31 @@ impl DeltaLayerWriterInner {
    /// The values must be appended in key, lsn order.
    ///
    async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
-        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
-            .await
+        let (_, res) = self
+            .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init())
+            .await;
+        res
    }

    async fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
-        val: &[u8],
+        val: Vec<u8>,
        will_init: bool,
-    ) -> anyhow::Result<()> {
+    ) -> (Vec<u8>, anyhow::Result<()>) {
        assert!(self.lsn_range.start <= lsn);
-
-        let off = self.blob_writer.write_blob(val).await?;
+        let (val, res) = self.blob_writer.write_blob(val).await;
+        let off = match res {
+            Ok(off) => off,
+            Err(e) => return (val, Err(anyhow::anyhow!(e))),
+        };

        let blob_ref = BlobRef::new(off, will_init);

        let delta_key = DeltaKey::from_key_lsn(&key, lsn);
-        self.tree.append(&delta_key.0, blob_ref.0)?;
-
-        Ok(())
+        let res = self.tree.append(&delta_key.0, blob_ref.0);
+        (val, res.map_err(|e| anyhow::anyhow!(e)))
    }

    fn size(&self) -> u64 {
@@ -463,7 +467,8 @@ impl DeltaLayerWriterInner {
        file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
            .await?;
        for buf in block_buf.blocks {
-            file.write_all(buf.as_ref()).await?;
+            let (_buf, res) = file.write_all(buf).await;
+            res?;
        }
        assert!(self.lsn_range.start < self.lsn_range.end);
        // Fill in the summary on blk 0
@@ -478,17 +483,12 @@ impl DeltaLayerWriterInner {
            index_root_blk,
        };

-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here but it's a pain with Slice<T>
        Summary::ser_into(&summary, &mut buf)?;
-        if buf.spilled() {
-            // This is bad as we only have one free block for the summary
-            warn!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            );
-        }
        file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;

        let metadata = file
            .metadata()
@@ -593,9 +593,9 @@ impl DeltaLayerWriter {
        &mut self,
        key: Key,
        lsn: Lsn,
-        val: &[u8],
+        val: Vec<u8>,
        will_init: bool,
-    ) -> anyhow::Result<()> {
+    ) -> (Vec<u8>, anyhow::Result<()>) {
        self.inner
            .as_mut()
            .unwrap()
@@ -681,18 +681,12 @@ impl DeltaLayer {

        let new_summary = rewrite(actual_summary);

-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here, but it's a pain with Slice<T>
        Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
-        if buf.spilled() {
-            // The code in DeltaLayerWriterInner just warn!()s for this.
-            // It should probably error out as well.
-            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            )));
-        }
        file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;
        Ok(())
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -343,18 +343,12 @@ impl ImageLayer {

        let new_summary = rewrite(actual_summary);

-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here but it's a pain with Slice<T>
        Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
-        if buf.spilled() {
-            // The code in ImageLayerWriterInner just warn!()s for this.
-            // It should probably error out as well.
-            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            )));
-        }
        file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;
        Ok(())
    }
 }
@@ -601,9 +595,11 @@ impl ImageLayerWriterInner {
    ///
    /// The page versions must be appended in blknum order.
    ///
-    async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+    async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
        ensure!(self.key_range.contains(&key));
-        let off = self.blob_writer.write_blob(img).await?;
+        let (_img, res) = self.blob_writer.write_blob(img).await;
+        // TODO: re-use the buffer for `img` further upstack
+        let off = res?;

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
@@ -626,7 +622,8 @@ impl ImageLayerWriterInner {
            .await?;
        let (index_root_blk, block_buf) = self.tree.finish()?;
        for buf in block_buf.blocks {
-            file.write_all(buf.as_ref()).await?;
+            let (_buf, res) = file.write_all(buf).await;
+            res?;
        }

        // Fill in the summary on blk 0
@@ -641,17 +638,12 @@ impl ImageLayerWriterInner {
            index_root_blk,
        };

-        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        let mut buf = Vec::with_capacity(PAGE_SZ);
+        // TODO: could use smallvec here but it's a pain with Slice<T>
        Summary::ser_into(&summary, &mut buf)?;
-        if buf.spilled() {
-            // This is bad as we only have one free block for the summary
-            warn!(
-                "Used more than one page size for summary buffer: {}",
-                buf.len()
-            );
-        }
        file.seek(SeekFrom::Start(0)).await?;
-        file.write_all(&buf).await?;
+        let (_buf, res) = file.write_all(buf).await;
+        res?;

        let metadata = file
            .metadata()
@@ -732,7 +724,7 @@ impl ImageLayerWriter {
    ///
    /// The page versions must be appended in blknum order.
    ///
-    pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
+    pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
        self.inner.as_mut().unwrap().put_image(key, img).await
    }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -474,9 +474,11 @@ impl InMemoryLayer {
            for (lsn, pos) in vec_map.as_slice() {
                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
                let will_init = Value::des(&buf)?.will_init();
-                delta_layer_writer
-                    .put_value_bytes(key, *lsn, &buf, will_init)
-                    .await?;
+                let res;
+                (buf, res) = delta_layer_writer
+                    .put_value_bytes(key, *lsn, buf, will_init)
+                    .await;
+                res?;
            }
        }

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -325,8 +325,8 @@ impl Layer {
        })
    }

-    pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.0.info(reset).await
+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.0.info(reset)
    }

    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -637,10 +637,10 @@ impl LayerInner {
        let mut rx = self.status.subscribe();

        let strong = {
-            match self.inner.get_mut().await {
+            match self.inner.get() {
                Some(mut either) => {
                    self.wanted_evicted.store(true, Ordering::Relaxed);
-                    ResidentOrWantedEvicted::downgrade(&mut either)
+                    either.downgrade()
                }
                None => return Err(EvictionError::NotFound),
            }
@@ -666,7 +666,7 @@ impl LayerInner {
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
                LAYER_IMPL_METRICS.inc_broadcast_lagged();
-                match self.inner.get_mut().await {
+                match self.inner.get() {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -784,7 +784,7 @@ impl LayerInner {
                // use the already held initialization permit because it is impossible to hit the
                // below paths anymore essentially limiting the max loop iterations to 2.
                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit).await;
+                let mut guard = self.inner.set(value, init_permit);
                let (strong, _upgraded) = guard
                    .get_and_upgrade()
                    .expect("init creates strong reference, we held the init permit");
@@ -792,7 +792,7 @@ impl LayerInner {
            }

            let (weak, permit) = {
-                let mut locked = self.inner.get_mut_or_init(download).await?;
+                let mut locked = self.inner.get_or_init(download).await?;

                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
                    if upgraded {
@@ -1014,12 +1014,12 @@ impl LayerInner {
        }
    }

-    async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

        // this is not accurate: we could have the file locally but there was a cancellation
        // and now we are not in sync, or we are currently downloading it.
-        let remote = self.inner.get_mut().await.is_none();
+        let remote = self.inner.get().is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -1078,7 +1078,7 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                    return;
                };
-                match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
+                match this.evict_blocking(version) {
                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                }
@@ -1086,7 +1086,7 @@ impl LayerInner {
        }
    }

-    async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
+    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
@@ -1095,7 +1095,7 @@ impl LayerInner {
        // to avoid starting a new download while we evict, keep holding on to the
        // permit.
        let _permit = {
-            let maybe_downloaded = self.inner.get_mut().await;
+            let maybe_downloaded = self.inner.get();

            let (_weak, permit) = match maybe_downloaded {
                Some(mut guard) => {
@@ -1460,10 +1460,6 @@ impl ResidentLayer {
        &self.owner.0.path
    }

-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        self.owner.access_stats()
-    }
-
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.owner.metadata()
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -12,7 +12,9 @@ use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::stream::StreamExt;
 use itertools::Itertools;
+use once_cell::sync::Lazy;
 use pageserver_api::{
    keyspace::KeySpaceAccum,
    models::{
@@ -33,17 +35,22 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::sync::gate::Gate;

-use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
+use std::{
+    array,
+    collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
+    sync::atomic::AtomicU64,
+};
 use std::{
    cmp::{max, min, Ordering},
    ops::ControlFlow,
 };

+use crate::pgdatadir_mapping::DirectoryKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
@@ -108,10 +115,7 @@ use super::remote_timeline_client::RemoteTimelineClient;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
 use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
-use super::{
-    remote_timeline_client::index::{IndexLayerMetadata, IndexPart},
-    storage_layer::LayerFringe,
-};
+use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -260,6 +264,8 @@ pub struct Timeline {
    // in `crate::page_service` writes these metrics.
    pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,

+    directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
+
    /// Ensures layers aren't frozen by checkpointer between
    /// [`Timeline::get_layer_for_write`] and layer reads.
    /// Locked automatically by [`TimelineWriter`] and checkpointer.
@@ -945,6 +951,10 @@ impl Timeline {
        self.metrics.resident_physical_size_get()
    }

+    pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
+        array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
+    }
+
    ///
    /// Wait until WAL has been received and processed up to this LSN.
    ///
@@ -1424,7 +1434,7 @@ impl Timeline {
        let mut historic_layers = Vec::new();
        for historic_layer in layer_map.iter_historic_layers() {
            let historic_layer = guard.get_from_desc(&historic_layer);
-            historic_layers.push(historic_layer.info(reset).await);
+            historic_layers.push(historic_layer.info(reset));
        }

        LayerMapInfo {
@@ -1614,7 +1624,7 @@ impl Timeline {
                generation,
                shard_identity,
                pg_version,
-                layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
+                layers: Default::default(),
                wanted_image_layers: Mutex::new(None),

                walredo_mgr,
@@ -1651,6 +1661,8 @@ impl Timeline {
                    &timeline_id,
                ),

+                directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
+
                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

                layer_flush_start_tx,
@@ -2419,6 +2431,29 @@ impl Timeline {
        }
    }

+    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
+        self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+        let aux_metric =
+            self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
+
+        let sum_of_entries = self
+            .directory_metrics
+            .iter()
+            .map(|v| v.load(AtomicOrdering::Relaxed))
+            .sum();
+        // Set a high general threshold and a lower threshold for the auxiliary files,
+        // as we can have large numbers of relations in the db directory.
+        const SUM_THRESHOLD: u64 = 5000;
+        const AUX_THRESHOLD: u64 = 1000;
+        if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
+            self.metrics
+                .directory_entries_count_gauge
+                .set(sum_of_entries);
+        } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
+            metric.set(sum_of_entries);
+        }
+    }
+
    async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
        let guard = self.layers.read().await;
        for historic_layer in guard.layer_map().iter_historic_layers() {
@@ -2439,45 +2474,28 @@ impl Timeline {
    /// should treat this as a cue to simply skip doing any heatmap uploading
    /// for this timeline.
    pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
-        let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
+        // no point in heatmaps without remote client
+        let _remote_client = self.remote_client.as_ref()?;

-        let remote_client = match &self.remote_client {
-            Some(c) => c,
-            None => return None,
-        };
+        if !self.is_active() {
+            return None;
+        }

-        let layer_file_names = eviction_info
-            .resident_layers
-            .iter()
-            .map(|l| l.layer.get_name())
-            .collect::<Vec<_>>();
+        let guard = self.layers.read().await;

-        let decorated = match remote_client.get_layers_metadata(layer_file_names) {
-            Ok(d) => d,
-            Err(_) => {
-                // Getting metadata only fails on Timeline in bad state.
-                return None;
-            }
-        };
+        let resident = guard.resident_layers().map(|layer| {
+            let last_activity_ts = layer.access_stats().latest_activity_or_now();

-        let heatmap_layers = std::iter::zip(
-            eviction_info.resident_layers.into_iter(),
-            decorated.into_iter(),
-        )
-        .filter_map(|(layer, remote_info)| {
-            remote_info.map(|remote_info| {
-                HeatMapLayer::new(
-                    layer.layer.get_name(),
-                    IndexLayerMetadata::from(remote_info),
-                    layer.last_activity_ts,
-                )
-            })
+            HeatMapLayer::new(
+                layer.layer_desc().filename(),
+                layer.metadata().into(),
+                last_activity_ts,
+            )
        });

-        Some(HeatMapTimeline::new(
-            self.timeline_id,
-            heatmap_layers.collect(),
-        ))
+        let layers = resident.collect().await;
+
+        Some(HeatMapTimeline::new(self.timeline_id, layers))
    }
 }

@@ -3634,7 +3652,7 @@ impl Timeline {
                                    }
                                };

-                                image_layer_writer.put_image(img_key, &img).await?;
+                                image_layer_writer.put_image(img_key, img).await?;
                            }
                        }

@@ -4968,41 +4986,24 @@ impl Timeline {
    /// Returns non-remote layers for eviction.
    pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
        let guard = self.layers.read().await;
-        let layers = guard.layer_map();
-
        let mut max_layer_size: Option<u64> = None;
-        let mut resident_layers = Vec::new();

-        for l in layers.iter_historic_layers() {
-            let file_size = l.file_size();
-            max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
+        let resident_layers = guard
+            .resident_layers()
+            .map(|layer| {
+                let file_size = layer.layer_desc().file_size;
+                max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));

-            let l = guard.get_from_desc(&l);
+                let last_activity_ts = layer.access_stats().latest_activity_or_now();

-            let l = match l.keep_resident().await {
-                Ok(Some(l)) => l,
-                Ok(None) => continue,
-                Err(e) => {
-                    // these should not happen, but we cannot make them statically impossible right
-                    // now.
-                    tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}");
-                    continue;
+                EvictionCandidate {
+                    layer: layer.into(),
+                    last_activity_ts,
+                    relative_last_activity: finite_f32::FiniteF32::ZERO,
                }
-            };
-
-            let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| {
-                // We only use this fallback if there's an implementation error.
-                // `latest_activity` already does rate-limited warn!() log.
-                debug!(layer=%l, "last_activity returns None, using SystemTime::now");
-                SystemTime::now()
-            });
-
-            resident_layers.push(EvictionCandidate {
-                layer: l.drop_eviction_guard().into(),
-                last_activity_ts,
-                relative_last_activity: finite_f32::FiniteF32::ZERO,
-            });
-        }
+            })
+            .collect()
+            .await;

        DiskUsageEvictionInfo {
            max_layer_size,
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument, Span};
+use tracing::{debug, error, info, instrument, warn, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
@@ -541,12 +541,7 @@ impl DeleteTimelineFlow {
                };
                Ok(())
            }
-            .instrument({
-                let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
-                span.follows_from(Span::current());
-                span
-            }),
+            .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
        );
    }

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -196,13 +196,13 @@ impl Timeline {
            ControlFlow::Continue(()) => (),
        }

-        #[allow(dead_code)]
        #[derive(Debug, Default)]
        struct EvictionStats {
            candidates: usize,
            evicted: usize,
            errors: usize,
            not_evictable: usize,
+            #[allow(dead_code)]
            skipped_for_shutdown: usize,
        }

@@ -239,12 +239,7 @@ impl Timeline {
                    }
                };

-                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
-                    // We only use this fallback if there's an implementation error.
-                    // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
-                    SystemTime::now()
-                });
+                let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();

                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure, Context, Result};
+use futures::StreamExt;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -20,19 +21,13 @@ use crate::{
 };

 /// Provides semantic APIs to manipulate the layer map.
+#[derive(Default)]
 pub(crate) struct LayerManager {
    layer_map: LayerMap,
    layer_fmgr: LayerFileManager<Layer>,
 }

 impl LayerManager {
-    pub(crate) fn create() -> Self {
-        Self {
-            layer_map: LayerMap::default(),
-            layer_fmgr: LayerFileManager::new(),
-        }
-    }
-
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.layer_fmgr.get_from_desc(desc)
    }
@@ -246,6 +241,32 @@ impl LayerManager {
        layer.delete_on_drop();
    }

+    pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
+        // for small layer maps, we most likely have all resident, but for larger more are likely
+        // to be evicted assuming lots of layers correlated with longer lifespan.
+
+        let layers = self
+            .layer_map()
+            .iter_historic_layers()
+            .map(|desc| self.get_from_desc(&desc));
+
+        let layers = futures::stream::iter(layers);
+
+        layers.filter_map(|layer| async move {
+            // TODO(#6028): this query does not really need to see the ResidentLayer
+            match layer.keep_resident().await {
+                Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
+                Ok(None) => None,
+                Err(e) => {
+                    // these should not happen, but we cannot make them statically impossible right
+                    // now.
+                    tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
+                    None
+                }
+            }
+        })
+    }
+
    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
@@ -253,6 +274,12 @@ impl LayerManager {

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

+impl<T> Default for LayerFileManager<T> {
+    fn default() -> Self {
+        Self(HashMap::default())
+    }
+}
+
 impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
@@ -275,10 +302,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
        self.0.contains_key(&layer.layer_desc().key())
    }

-    pub(crate) fn new() -> Self {
-        Self(HashMap::new())
-    }
-
    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,7 +19,7 @@ use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-use tokio_epoll_uring::IoBufMut;
+use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
@@ -28,9 +28,10 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use utils::fs_ext;

-mod io_engine;
+pub use pageserver_api::models::virtual_file as api;
+pub(crate) mod io_engine;
 mod open_options;
-pub use io_engine::IoEngineKind;
+pub(crate) use io_engine::IoEngineKind;
 pub(crate) use open_options::*;

 ///
@@ -409,10 +410,10 @@ impl VirtualFile {
    /// step, the tmp path is renamed to the final path. As renames are
    /// atomic, a crash during the write operation will never leave behind a
    /// partially written file.
-    pub async fn crashsafe_overwrite(
+    pub async fn crashsafe_overwrite<B: BoundedBuf>(
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
-        content: &[u8],
+        content: B,
    ) -> std::io::Result<()> {
        let Some(final_path_parent) = final_path.parent() else {
            return Err(std::io::Error::from_raw_os_error(
@@ -429,7 +430,8 @@ impl VirtualFile {
                .create_new(true),
        )
        .await?;
-        file.write_all(content).await?;
+        let (_content, res) = file.write_all(content).await;
+        res?;
        file.sync_all().await?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
@@ -580,43 +582,69 @@ impl VirtualFile {
    }

    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
+    pub async fn write_all_at<B: BoundedBuf>(
+        &self,
+        buf: B,
+        mut offset: u64,
+    ) -> (B::Buf, Result<(), Error>) {
+        let buf_len = buf.bytes_init();
+        if buf_len == 0 {
+            return (Slice::into_inner(buf.slice_full()), Ok(()));
+        }
+        let mut buf = buf.slice(0..buf_len);
        while !buf.is_empty() {
-            match self.write_at(buf, offset).await {
+            // TODO: push `buf` further down
+            match self.write_at(&buf, offset).await {
                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::WriteZero,
-                        "failed to write whole buffer",
-                    ));
+                    return (
+                        Slice::into_inner(buf),
+                        Err(Error::new(
+                            std::io::ErrorKind::WriteZero,
+                            "failed to write whole buffer",
+                        )),
+                    );
                }
                Ok(n) => {
-                    buf = &buf[n..];
+                    buf = buf.slice(n..);
                    offset += n as u64;
                }
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
+                Err(e) => return (Slice::into_inner(buf), Err(e)),
            }
        }
-        Ok(())
+        (Slice::into_inner(buf), Ok(()))
    }

-    pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
+    /// Writes `buf.slice(0..buf.bytes_init())`.
+    /// Returns the IoBuf that is underlying the BoundedBuf `buf`.
+    /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
+    /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
+    pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
+        let nbytes = buf.bytes_init();
+        if nbytes == 0 {
+            return (Slice::into_inner(buf.slice_full()), Ok(0));
+        }
+        let mut buf = buf.slice(0..nbytes);
        while !buf.is_empty() {
-            match self.write(buf).await {
+            // TODO: push `Slice` further down
+            match self.write(&buf).await {
                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::WriteZero,
-                        "failed to write whole buffer",
-                    ));
+                    return (
+                        Slice::into_inner(buf),
+                        Err(Error::new(
+                            std::io::ErrorKind::WriteZero,
+                            "failed to write whole buffer",
+                        )),
+                    );
                }
                Ok(n) => {
-                    buf = &buf[n..];
+                    buf = buf.slice(n..);
                }
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
+                Err(e) => return (Slice::into_inner(buf), Err(e)),
            }
        }
-        Ok(())
+        (Slice::into_inner(buf), Ok(nbytes))
    }

    async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
@@ -675,7 +703,6 @@ where
    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
 {
-    use tokio_epoll_uring::BoundedBuf;
    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
    while buf.bytes_total() != 0 {
        let res;
@@ -1050,10 +1077,19 @@ mod tests {
                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
            }
        }
-        async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
+        async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
            match self {
-                MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
+                MaybeVirtualFile::VirtualFile(file) => {
+                    let (_buf, res) = file.write_all_at(buf, offset).await;
+                    res
+                }
+                MaybeVirtualFile::File(file) => {
+                    let buf_len = buf.bytes_init();
+                    if buf_len == 0 {
+                        return Ok(());
+                    }
+                    file.write_all_at(&buf.slice(0..buf_len), offset)
+                }
            }
        }
        async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
@@ -1062,10 +1098,19 @@ mod tests {
                MaybeVirtualFile::File(file) => file.seek(pos),
            }
        }
-        async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
+        async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
            match self {
-                MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
-                MaybeVirtualFile::File(file) => file.write_all(buf),
+                MaybeVirtualFile::VirtualFile(file) => {
+                    let (_buf, res) = file.write_all(buf).await;
+                    res.map(|_| ())
+                }
+                MaybeVirtualFile::File(file) => {
+                    let buf_len = buf.bytes_init();
+                    if buf_len == 0 {
+                        return Ok(());
+                    }
+                    file.write_all(&buf.slice(0..buf_len))
+                }
            }
        }

@@ -1140,7 +1185,7 @@ mod tests {
                .to_owned(),
        )
        .await?;
-        file_a.write_all(b"foobar").await?;
+        file_a.write_all(b"foobar".to_vec()).await?;

        // cannot read from a file opened in write-only mode
        let _ = file_a.read_string().await.unwrap_err();
@@ -1149,7 +1194,7 @@ mod tests {
        let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;

        // cannot write to a file opened in read-only mode
-        let _ = file_a.write_all(b"bar").await.unwrap_err();
+        let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();

        // Try simple read
        assert_eq!("foobar", file_a.read_string().await?);
@@ -1191,8 +1236,8 @@ mod tests {
                .to_owned(),
        )
        .await?;
-        file_b.write_all_at(b"BAR", 3).await?;
-        file_b.write_all_at(b"FOO", 0).await?;
+        file_b.write_all_at(b"BAR".to_vec(), 3).await?;
+        file_b.write_all_at(b"FOO".to_vec(), 0).await?;

        assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");

@@ -1292,7 +1337,7 @@ mod tests {
        let path = testdir.join("myfile");
        let tmp_path = testdir.join("myfile.tmp");

-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1301,7 +1346,7 @@ mod tests {
        assert!(!tmp_path.exists());
        drop(file);

-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1323,7 +1368,7 @@ mod tests {
        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
        assert!(tmp_path.exists());

-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
            .await
            .unwrap();

--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -7,67 +7,100 @@
 //!
 //! Then use [`get`] and  [`super::OpenOptions`].

-#[derive(
-    Copy,
-    Clone,
-    PartialEq,
-    Eq,
-    Hash,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-    Debug,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum IoEngineKind {
+pub(crate) use super::api::IoEngineKind;
+#[derive(Clone, Copy)]
+#[repr(u8)]
+pub(crate) enum IoEngine {
+    NotSet,
    StdFs,
    #[cfg(target_os = "linux")]
    TokioEpollUring,
 }

-static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
-
-#[cfg(not(test))]
-pub(super) fn init(engine: IoEngineKind) {
-    if IO_ENGINE.set(engine).is_err() {
-        panic!("called twice");
+impl From<IoEngineKind> for IoEngine {
+    fn from(value: IoEngineKind) -> Self {
+        match value {
+            IoEngineKind::StdFs => IoEngine::StdFs,
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring,
+        }
    }
-    crate::metrics::virtual_file_io_engine::KIND
-        .with_label_values(&[&format!("{engine}")])
-        .set(1);
 }

-pub(super) fn get() -> &'static IoEngineKind {
-    #[cfg(test)]
-    {
-        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
-        IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
-            Ok(v) => match v.parse::<IoEngineKind>() {
-                Ok(engine_kind) => engine_kind,
-                Err(e) => {
-                    panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
-                }
-            },
-            Err(std::env::VarError::NotPresent) => {
-                crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
-                    .parse()
-                    .unwrap()
-            }
-            Err(std::env::VarError::NotUnicode(_)) => {
-                panic!("env var {env_var_name} is not unicode");
-            }
+impl TryFrom<u8> for IoEngine {
+    type Error = u8;
+
+    fn try_from(value: u8) -> Result<Self, Self::Error> {
+        Ok(match value {
+            v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet,
+            v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs,
+            #[cfg(target_os = "linux")]
+            v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring,
+            x => return Err(x),
        })
    }
-    #[cfg(not(test))]
-    IO_ENGINE.get().unwrap()
 }

-use std::os::unix::prelude::FileExt;
+static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
+
+pub(crate) fn set(engine_kind: IoEngineKind) {
+    let engine: IoEngine = engine_kind.into();
+    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
+    #[cfg(not(test))]
+    {
+        let metric = &crate::metrics::virtual_file_io_engine::KIND;
+        metric.reset();
+        metric
+            .with_label_values(&[&format!("{engine_kind}")])
+            .set(1);
+    }
+}
+
+#[cfg(not(test))]
+pub(super) fn init(engine_kind: IoEngineKind) {
+    set(engine_kind);
+}
+
+pub(super) fn get() -> IoEngine {
+    let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
+    if cfg!(test) {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        match cur {
+            IoEngine::NotSet => {
+                let kind = match std::env::var(env_var_name) {
+                    Ok(v) => match v.parse::<IoEngineKind>() {
+                        Ok(engine_kind) => engine_kind,
+                        Err(e) => {
+                            panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+                        }
+                    },
+                    Err(std::env::VarError::NotPresent) => {
+                        crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
+                            .parse()
+                            .unwrap()
+                    }
+                    Err(std::env::VarError::NotUnicode(_)) => {
+                        panic!("env var {env_var_name} is not unicode");
+                    }
+                };
+                self::set(kind);
+                self::get()
+            }
+            x => x,
+        }
+    } else {
+        cur
+    }
+}
+
+use std::{
+    os::unix::prelude::FileExt,
+    sync::atomic::{AtomicU8, Ordering},
+};

 use super::FileGuard;

-impl IoEngineKind {
+impl IoEngine {
    pub(super) async fn read_at<B>(
        &self,
        file_guard: FileGuard,
@@ -78,7 +111,8 @@ impl IoEngineKind {
        B: tokio_epoll_uring::BoundedBufMut + Send,
    {
        match self {
-            IoEngineKind::StdFs => {
+            IoEngine::NotSet => panic!("not initialized"),
+            IoEngine::StdFs => {
                // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
                let dst = unsafe {
                    std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
@@ -96,7 +130,7 @@ impl IoEngineKind {
                ((file_guard, buf), res)
            }
            #[cfg(target_os = "linux")]
-            IoEngineKind::TokioEpollUring => {
+            IoEngine::TokioEpollUring => {
                let system = tokio_epoll_uring::thread_local_system().await;
                let (resources, res) = system.read(file_guard, offset, buf).await;
                (
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -1,6 +1,6 @@
 //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];

-use super::IoEngineKind;
+use super::io_engine::IoEngine;
 use std::{os::fd::OwnedFd, path::Path};

 #[derive(Debug, Clone)]
@@ -13,9 +13,10 @@ pub enum OpenOptions {
 impl Default for OpenOptions {
    fn default() -> Self {
        match super::io_engine::get() {
-            IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
+            IoEngine::NotSet => panic!("io engine not set"),
+            IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
            #[cfg(target_os = "linux")]
-            IoEngineKind::TokioEpollUring => {
+            IoEngine::TokioEpollUring => {
                Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
            }
        }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -346,7 +346,7 @@ impl WalIngest {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

                if info == pg_constants::XLOG_LOGICAL_MESSAGE {
-                    let xlrec = XlLogicalMessage::decode(&mut buf);
+                    let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf);
                    let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
                    let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
                    if prefix == "neon-test" {
@@ -1695,22 +1695,22 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x20));
        walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?;
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
            .await?;
        m.commit(&ctx).await?;

@@ -1751,46 +1751,46 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 2")
+            test_img("foo blk 0 at 2")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 2 at 5")
+            test_img("foo blk 2 at 5")
        );

        // Truncate last block
@@ -1812,13 +1812,13 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
        );

        // should still see the truncated block with older LSN
@@ -1832,7 +1832,7 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 2 at 5")
+            test_img("foo blk 2 at 5")
        );

        // Truncate to zero length
@@ -1851,7 +1851,7 @@ mod tests {
        // Extend from 0 to 2 blocks, leaving a gap
        let mut m = tline.begin_modification(Lsn(0x70));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
@@ -1870,13 +1870,13 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1")
+            test_img("foo blk 1")
        );

        // Extend a lot more, leaving a big gap that spans across segments
        let mut m = tline.begin_modification(Lsn(0x80));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
@@ -1897,7 +1897,7 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1500")
+            test_img("foo blk 1500")
        );

        Ok(())
@@ -1915,7 +1915,7 @@ mod tests {

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
        m.commit(&ctx).await?;

@@ -1952,7 +1952,7 @@ mod tests {
        // Re-create it
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx)
            .await?;
        m.commit(&ctx).await?;

@@ -1990,7 +1990,7 @@ mod tests {
        for blkno in 0..relsize {
            let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
            walingest
-                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                .await?;
        }
        m.commit(&ctx).await?;
@@ -2028,7 +2028,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2055,7 +2055,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2073,7 +2073,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2084,7 +2084,7 @@ mod tests {
        for blkno in 0..relsize {
            let data = format!("foo blk {} at {}", blkno, lsn);
            walingest
-                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                .await?;
        }
        m.commit(&ctx).await?;
@@ -2109,7 +2109,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2130,7 +2130,7 @@ mod tests {
        for blknum in 0..RELSEG_SIZE + 1 {
            lsn += 0x10;
            let mut m = tline.begin_modification(Lsn(lsn));
-            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
+            let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -44,6 +44,11 @@ pub enum NeonWalRecord {
        moff: MultiXactOffset,
        members: Vec<MultiXactMember>,
    },
+    /// Update the map of AUX files, either writing or dropping an entry
+    AuxFile {
+        file_path: String,
+        content: Option<Bytes>,
+    },
 }

 impl NeonWalRecord {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,7 @@
 mod process;

 /// Code to apply [`NeonWalRecord`]s.
-mod apply_neon;
+pub(crate) mod apply_neon;

 use crate::config::PageServerConf;
 use crate::metrics::{
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,7 +1,8 @@
+use crate::pgdatadir_mapping::AuxFilesDirectory;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
-use bytes::BytesMut;
+use bytes::{BufMut, BytesMut};
 use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
@@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
 };
 use postgres_ffi::BLCKSZ;
 use tracing::*;
+use utils::bin_ser::BeSer;

 /// Can this request be served by neon redo functions
 /// or we need to pass it to wal-redo postgres process?
@@ -230,6 +232,72 @@ pub(crate) fn apply_in_neon(
                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
            }
        }
+        NeonWalRecord::AuxFile { file_path, content } => {
+            let mut dir = AuxFilesDirectory::des(page)?;
+            dir.upsert(file_path.clone(), content.clone());
+
+            page.clear();
+            let mut writer = page.writer();
+            dir.ser_into(&mut writer)?;
+        }
    }
    Ok(())
 }
+
+#[cfg(test)]
+mod test {
+    use bytes::Bytes;
+    use pageserver_api::key::AUX_FILES_KEY;
+
+    use super::*;
+    use std::collections::HashMap;
+
+    use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
+
+    /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
+    #[test]
+    fn apply_aux_file_deltas() -> anyhow::Result<()> {
+        let base_dir = AuxFilesDirectory {
+            files: HashMap::from([
+                ("two".to_string(), Bytes::from_static(b"content0")),
+                ("three".to_string(), Bytes::from_static(b"contentX")),
+            ]),
+        };
+        let base_image = AuxFilesDirectory::ser(&base_dir)?;
+
+        let deltas = vec![
+            // Insert
+            NeonWalRecord::AuxFile {
+                file_path: "one".to_string(),
+                content: Some(Bytes::from_static(b"content1")),
+            },
+            // Update
+            NeonWalRecord::AuxFile {
+                file_path: "two".to_string(),
+                content: Some(Bytes::from_static(b"content99")),
+            },
+            // Delete
+            NeonWalRecord::AuxFile {
+                file_path: "three".to_string(),
+                content: None,
+            },
+        ];
+
+        let file_path = AUX_FILES_KEY;
+        let mut page = BytesMut::from_iter(base_image);
+
+        for record in deltas {
+            apply_in_neon(&record, file_path, &mut page)?;
+        }
+
+        let reconstructed = AuxFilesDirectory::des(&page)?;
+        let expect = HashMap::from([
+            ("one".to_string(), Bytes::from_static(b"content1")),
+            ("two".to_string(), Bytes::from_static(b"content99")),
+        ]);
+
+        assert_eq!(reconstructed.files, expect);
+
+        Ok(())
+    }
+}