Merge remote-tracking branch 'origin/problame/batching-sidecar-task' into problame/batching-metrics-improvements

2026-05-25 17:10:38 +00:00 · 2024-11-29 17:58:49 +01:00
parent c4f92a21bf 2cab051921
commit 6d36c07a33
281 changed files with 19632 additions and 2395 deletions
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
+    info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");

    // The tenants directory contains all the pageserver local disk state.
    // Create if not exists and make sure all the contents are durable before proceeding.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig};
 use std::env;
 use storage_broker::Uri;
 use utils::logging::SecretString;
+use utils::postgres_client::PostgresClientProtocol;

 use once_cell::sync::OnceCell;
 use reqwest::Url;
@@ -144,6 +145,10 @@ pub struct PageServerConf {
    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option<SecretString>,

+    pub import_pgdata_upcall_api: Option<Url>,
+    pub import_pgdata_upcall_api_token: Option<SecretString>,
+    pub import_pgdata_aws_endpoint_url: Option<Url>,
+
    /// If true, pageserver will make best-effort to operate without a control plane: only
    /// for use in major incidents.
    pub control_plane_emergency_mode: bool,
@@ -183,7 +188,9 @@ pub struct PageServerConf {
    /// Optionally disable disk syncs (unsafe!)
    pub no_sync: bool,

-    pub page_service_pipelining: Option<pageserver_api::config::PageServicePipeliningConfig>,
+    pub wal_receiver_protocol: PostgresClientProtocol,
+
+    pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
 }

 /// Token for authentication to safekeepers
@@ -326,6 +333,9 @@ impl PageServerConf {
            control_plane_api,
            control_plane_api_token,
            control_plane_emergency_mode,
+            import_pgdata_upcall_api,
+            import_pgdata_upcall_api_token,
+            import_pgdata_aws_endpoint_url,
            heatmap_upload_concurrency,
            secondary_download_concurrency,
            ingest_batch_size,
@@ -340,6 +350,7 @@ impl PageServerConf {
            virtual_file_io_engine,
            tenant_config,
            no_sync,
+            wal_receiver_protocol,
            page_service_pipelining,
        } = config_toml;

@@ -380,6 +391,10 @@ impl PageServerConf {
            image_compression,
            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
+            import_pgdata_upcall_api,
+            import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
+            import_pgdata_aws_endpoint_url,
+            wal_receiver_protocol,
            page_service_pipelining,

            // ------------------------------------------------------------
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -1144,18 +1144,24 @@ pub(crate) mod mock {
        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
        cancel: CancellationToken,
+        executed: Arc<AtomicUsize>,
    }

    impl ConsumerState {
-        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
-            let mut executed = 0;
-
+        async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
            info!("Executing all pending deletions");

            // Transform all executor messages to generic frontend messages
-            while let Ok(msg) = self.executor_rx.try_recv() {
+            loop {
+                use either::Either;
+                let msg = tokio::select! {
+                    left = self.executor_rx.recv() => Either::Left(left),
+                    right = self.rx.recv() => Either::Right(right),
+                };
                match msg {
-                    DeleterMessage::Delete(objects) => {
+                    Either::Left(None) => break,
+                    Either::Right(None) => break,
+                    Either::Left(Some(DeleterMessage::Delete(objects))) => {
                        for path in objects {
                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
@@ -1165,18 +1171,13 @@ pub(crate) mod mock {
                                    error!("Failed to delete {path}, leaking object! ({e})");
                                }
                            }
-                            executed += 1;
+                            self.executed.fetch_add(1, Ordering::Relaxed);
                        }
                    }
-                    DeleterMessage::Flush(flush_op) => {
+                    Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
                        flush_op.notify();
                    }
-                }
-            }
-
-            while let Ok(msg) = self.rx.try_recv() {
-                match msg {
-                    ListWriterQueueMessage::Delete(op) => {
+                    Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
                        let mut objects = op.objects;
                        for (layer, meta) in op.layers {
                            objects.push(remote_layer_path(
@@ -1198,33 +1199,27 @@ pub(crate) mod mock {
                                    error!("Failed to delete {path}, leaking object! ({e})");
                                }
                            }
-                            executed += 1;
+                            self.executed.fetch_add(1, Ordering::Relaxed);
                        }
                    }
-                    ListWriterQueueMessage::Flush(op) => {
+                    Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
                        op.notify();
                    }
-                    ListWriterQueueMessage::FlushExecute(op) => {
+                    Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
                        // We have already executed all prior deletions because mock does them inline
                        op.notify();
                    }
-                    ListWriterQueueMessage::Recover(_) => {
+                    Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
                        // no-op in mock
                    }
                }
-                info!("All pending deletions have been executed");
            }
-
-            executed
        }
    }

    pub struct MockDeletionQueue {
        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
-        executed: Arc<AtomicUsize>,
-        remote_storage: Option<GenericRemoteStorage>,
-        consumer: std::sync::Mutex<ConsumerState>,
        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
    }

@@ -1235,29 +1230,34 @@ pub(crate) mod mock {

            let executed = Arc::new(AtomicUsize::new(0));

+            let mut consumer = ConsumerState {
+                rx,
+                executor_rx,
+                cancel: CancellationToken::new(),
+                executed: executed.clone(),
+            };
+
+            tokio::spawn(async move {
+                if let Some(remote_storage) = &remote_storage {
+                    consumer.consume(remote_storage).await;
+                }
+            });
+
            Self {
                tx,
                executor_tx,
-                executed,
-                remote_storage,
-                consumer: std::sync::Mutex::new(ConsumerState {
-                    rx,
-                    executor_rx,
-                    cancel: CancellationToken::new(),
-                }),
                lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
            }
        }

        #[allow(clippy::await_holding_lock)]
        pub async fn pump(&self) {
-            if let Some(remote_storage) = &self.remote_storage {
-                // Permit holding mutex across await, because this is only ever
-                // called once at a time in tests.
-                let mut locked = self.consumer.lock().unwrap();
-                let count = locked.consume(remote_storage).await;
-                self.executed.fetch_add(count, Ordering::Relaxed);
-            }
+            let (tx, rx) = tokio::sync::oneshot::channel();
+            self.executor_tx
+                .send(DeleterMessage::Flush(FlushOp { tx }))
+                .await
+                .expect("Failed to send flush message");
+            rx.await.ok();
        }

        pub(crate) fn new_client(&self) -> DeletionQueueClient {
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -15,6 +15,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
 use utils::backoff;
+use utils::pausable_failpoint;

 use crate::metrics;

@@ -90,6 +91,7 @@ impl Deleter {
    /// Block until everything in accumulator has been executed
    async fn flush(&mut self) -> Result<(), DeletionQueueError> {
        while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
+            pausable_failpoint!("deletion-queue-before-execute-pause");
            match self.remote_delete().await {
                Ok(()) => {
                    // Note: we assume that the remote storage layer returns Ok(()) if some
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -623,6 +623,8 @@ paths:
                existing_initdb_timeline_id:
                  type: string
                  format: hex
+                import_pgdata:
+                  $ref: "#/components/schemas/TimelineCreateRequestImportPgdata"
      responses:
        "201":
          description: Timeline was created, or already existed with matching parameters
@@ -979,6 +981,34 @@ components:
          $ref: "#/components/schemas/TenantConfig"
        effective_config:
          $ref: "#/components/schemas/TenantConfig"
+    TimelineCreateRequestImportPgdata:
+      type: object
+      required:
+        - location
+        - idempotency_key
+      properties:
+        idempotency_key:
+          type: string
+        location:
+          $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocation"
+    TimelineCreateRequestImportPgdataLocation:
+      type: object
+      properties:
+        AwsS3:
+          $ref: "#/components/schemas/TimelineCreateRequestImportPgdataLocationAwsS3"
+    TimelineCreateRequestImportPgdataLocationAwsS3:
+      type: object
+      properties:
+        region:
+          type: string
+        bucket:
+          type: string
+        key:
+          type: string
+      required:
+        - region
+        - bucket
+        - key
    TimelineInfo:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -40,6 +40,7 @@ use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::TimelineArchivalConfigRequest;
 use pageserver_api::models::TimelineCreateRequestMode;
+use pageserver_api::models::TimelineCreateRequestModeImportPgdata;
 use pageserver_api::models::TimelinesInfoAndOffloaded;
 use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::TopTenantShardsRequest;
@@ -55,6 +56,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
+use utils::http::endpoint::profile_cpu_handler;
 use utils::http::endpoint::prometheus_metrics_handler;
 use utils::http::endpoint::request_span;
 use utils::http::request::must_parse_query_param;
@@ -80,6 +82,7 @@ use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::timeline::import_pgdata;
 use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::offload::OffloadError;
 use crate::tenant::timeline::CompactFlags;
@@ -125,7 +128,7 @@ pub struct State {
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
    auth: Option<Arc<SwappableJwtAuth>>,
-    allowlist_routes: Vec<Uri>,
+    allowlist_routes: &'static [&'static str],
    remote_storage: GenericRemoteStorage,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -146,10 +149,13 @@ impl State {
        deletion_queue_client: DeletionQueueClient,
        secondary_controller: SecondaryController,
    ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
-            .iter()
-            .map(|v| v.parse().unwrap())
-            .collect::<Vec<_>>();
+        let allowlist_routes = &[
+            "/v1/status",
+            "/v1/doc",
+            "/swagger.yml",
+            "/metrics",
+            "/profile/cpu",
+        ];
        Ok(Self {
            conf,
            tenant_manager,
@@ -576,6 +582,35 @@ async fn timeline_create_handler(
            ancestor_timeline_id,
            ancestor_start_lsn,
        }),
+        TimelineCreateRequestMode::ImportPgdata {
+            import_pgdata:
+                TimelineCreateRequestModeImportPgdata {
+                    location,
+                    idempotency_key,
+                },
+        } => tenant::CreateTimelineParams::ImportPgdata(tenant::CreateTimelineParamsImportPgdata {
+            idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new(
+                idempotency_key.0,
+            ),
+            new_timeline_id,
+            location: {
+                use import_pgdata::index_part_format::Location;
+                use pageserver_api::models::ImportPgdataLocation;
+                match location {
+                    #[cfg(feature = "testing")]
+                    ImportPgdataLocation::LocalFs { path } => Location::LocalFs { path },
+                    ImportPgdataLocation::AwsS3 {
+                        region,
+                        bucket,
+                        key,
+                    } => Location::AwsS3 {
+                        region,
+                        bucket,
+                        key,
+                    },
+                }
+            },
+        }),
    };

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
@@ -3148,7 +3183,7 @@ pub fn make_router(
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
-            if state.allowlist_routes.contains(request.uri()) {
+            if state.allowlist_routes.contains(&request.uri().path()) {
                None
            } else {
                state.auth.as_deref()
@@ -3167,6 +3202,7 @@ pub fn make_router(
    Ok(router
        .data(state)
        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -356,6 +356,25 @@ async fn timed<Fut: std::future::Future>(
    }
 }

+/// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled.
+async fn timed_after_cancellation<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_at: std::time::Duration,
+    cancel: &CancellationToken,
+) -> <Fut as std::future::Future>::Output {
+    let mut fut = std::pin::pin!(fut);
+
+    tokio::select! {
+        _ = cancel.cancelled() => {
+            timed(fut, name, warn_at).await
+        }
+        ret = &mut fut => {
+            ret
+        }
+    }
+}
+
 #[cfg(test)]
 mod timed_tests {
    use super::timed;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3,7 +3,7 @@ use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
-    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
@@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
+    register_gauge_vec!(
+        "pageserver_flush_wait_upload_seconds",
+        "Time spent waiting for preceding uploads during layer flush",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_last_record_lsn",
@@ -653,6 +662,35 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
    .expect("failed to define a metric")
 });

+pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_relsize_cache_entries",
+        "Number of entries in the relation size cache",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
+        .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_cache_misses",
+        "Relation size cache misses",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_cache_misses_old",
+        "Relation size cache misses where the lookup LSN is older than the last relation update"
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -2097,6 +2135,7 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
+    pub(crate) clear_vm_bits_unknown: IntCounterVec,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2125,6 +2164,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
        "Total number of zero gap blocks written on relation extends"
    )
    .expect("failed to define a metric"),
+    clear_vm_bits_unknown: register_int_counter_vec!(
+        "pageserver_wal_ingest_clear_vm_bits_unknown",
+        "Number of ignored ClearVmBits operations due to unknown pages/relations",
+        &["entity"],
+    )
+    .expect("failed to define a metric"),
 });

 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2327,6 +2372,7 @@ pub(crate) struct TimelineMetrics {
    shard_id: String,
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
+    pub flush_wait_upload_time_gauge: Gauge,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
@@ -2370,6 +2416,9 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
+        let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
        let compact_time_histo = StorageTimeMetrics::new(
            StorageTimeOperation::Compact,
            &tenant_id,
@@ -2507,6 +2556,7 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
            flush_time_histo,
+            flush_wait_upload_time_gauge,
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
@@ -2554,6 +2604,14 @@ impl TimelineMetrics {
        self.resident_physical_size_gauge.get()
    }

+    /// Adds `duration` to this timeline's `pageserver_flush_wait_upload_seconds` gauge.
+    ///
+    /// The cached gauge was obtained from `FLUSH_WAIT_UPLOAD_TIME` with this timeline's labels,
+    /// so looking the metric up again and adding a second time would double-count `duration`.
+    pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
+        self.flush_wait_upload_time_gauge.add(duration);
+    }
+
    pub(crate) fn shutdown(&self) {
        let was_shutdown = self
            .shutdown
@@ -2570,6 +2628,7 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -7,7 +7,10 @@ use bytes::Buf;
 use futures::FutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pageserver_api::config::{PageServicePipeliningConfig, PageServiceProtocolPipeliningMode};
+use pageserver_api::config::{
+    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    PageServiceProtocolPipelinedExecutionStrategy,
+};
 use pageserver_api::models::{self, TenantState};
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -36,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::sync::spsc_fold;
 use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
@@ -44,7 +48,6 @@ use utils::{
 };

 use crate::auth::check_permission;
-use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -62,6 +65,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
+use crate::{basebackup, timed_after_cancellation};
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -158,7 +162,7 @@ pub async fn libpq_listener_main(
    auth: Option<Arc<SwappableJwtAuth>>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
-    pipelining_config: Option<PageServicePipeliningConfig>,
+    pipelining_config: PageServicePipeliningConfig,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
 ) -> Connections {
@@ -217,7 +221,7 @@ async fn page_service_conn_main(
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
-    pipelining_config: Option<PageServicePipeliningConfig>,
+    pipelining_config: PageServicePipeliningConfig,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
 ) -> ConnectionHandlerResult {
@@ -319,7 +323,7 @@ struct PageServerHandler {
    /// None only while pagestream protocol is being processed.
    timeline_handles: Option<TimelineHandles>,

-    pipelining_config: Option<PageServicePipeliningConfig>,
+    pipelining_config: PageServicePipeliningConfig,
 }

 struct TimelineHandles {
@@ -574,7 +578,7 @@ impl PageServerHandler {
    pub fn new(
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
-        pipelining_config: Option<PageServicePipeliningConfig>,
+        pipelining_config: PageServicePipeliningConfig,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
    ) -> Self {
@@ -620,7 +624,7 @@ impl PageServerHandler {
        cancel: &CancellationToken,
        ctx: &RequestContext,
        parent_span: Span,
-    ) -> Result<Option<Box<BatchedFeMessage>>, QueryError>
+    ) -> Result<Option<BatchedFeMessage>, QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
    {
@@ -722,7 +726,7 @@ impl PageServerHandler {
                            span,
                            error: $error,
                        };
-                        Ok(Some(Box::new(error)))
+                        Ok(Some(error))
                    }};
                }

@@ -773,7 +777,7 @@ impl PageServerHandler {
                }
            }
        };
-        Ok(Some(Box::new(batched_msg)))
+        Ok(Some(batched_msg))
    }

    /// Post-condition: `batch` is Some()
@@ -781,26 +785,25 @@ impl PageServerHandler {
    #[allow(clippy::boxed_local)]
    fn pagestream_do_batch(
        max_batch_size: NonZeroUsize,
-        batch: &mut Option<Box<BatchedFeMessage>>,
-        this_msg: Box<BatchedFeMessage>,
-    ) -> Option<Box<BatchedFeMessage>> {
+        batch: &mut Result<BatchedFeMessage, QueryError>,
+        this_msg: Result<BatchedFeMessage, QueryError>,
+    ) -> Result<(), Result<BatchedFeMessage, QueryError>> {
        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

-        match (batch.as_deref_mut(), *this_msg) {
-            // nothing batched yet
-            (None, this_msg) => {
-                *batch = Some(Box::new(this_msg));
-                None
-            }
+        let this_msg = match this_msg {
+            Ok(this_msg) => this_msg,
+            Err(e) => return Err(Err(e)),
+        };
+
+        match (&mut *batch, this_msg) {
            // something batched already, let's see if we can add this message to the batch
            (
-                Some(BatchedFeMessage::GetPage {
+                Ok(BatchedFeMessage::GetPage {
                    span: _,
                    shard: accum_shard,
                    pages: ref mut accum_pages,
                    effective_request_lsn: accum_lsn,
                }),
-                // would be nice to have box pattern here
                BatchedFeMessage::GetPage {
                    span: _,
                    shard: this_shard,
@@ -833,12 +836,12 @@ impl PageServerHandler {
            {
                // ok to batch
                accum_pages.extend(this_pages);
-                None
+                Ok(())
            }
            // something batched already but this message is unbatchable
-            (Some(_), this_msg) => {
+            (_, this_msg) => {
                // by default, don't continue batching
-                Some(Box::new(this_msg)) // TODO: avoid re-box
+                Err(Ok(this_msg))
            }
        }
    }
@@ -848,6 +851,7 @@ impl PageServerHandler {
        &mut self,
        pgb_writer: &mut PostgresBackend<IO>,
        batch: BatchedFeMessage,
+        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -984,7 +988,7 @@ impl PageServerHandler {
        }
        tokio::select! {
            biased;
-            _ = self.cancel.cancelled() => {
+            _ = cancel.cancelled() => {
                // We were requested to shut down.
                info!("shutdown request received in page handler");
                return Err(QueryError::Shutdown)
@@ -1041,8 +1045,8 @@ impl PageServerHandler {
            .expect("implementation error: timeline_handles should not be locked");

        let request_span = info_span!("request", shard_id = tracing::field::Empty);
-        let (pgb_reader, timeline_handles) = match self.pipelining_config.clone() {
-            Some(pipelining_config) => {
+        let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
+            PageServicePipeliningConfig::Pipelined(pipelining_config) => {
                self.handle_pagerequests_pipelined(
                    pgb,
                    pgb_reader,
@@ -1055,7 +1059,7 @@ impl PageServerHandler {
                )
                .await
            }
-            None => {
+            PageServicePipeliningConfig::Serial => {
                self.handle_pagerequests_serial(
                    pgb,
                    pgb_reader,
@@ -1067,7 +1071,7 @@ impl PageServerHandler {
                )
                .await
            }
-        }?;
+        };

        debug!("pagestream subprotocol shut down cleanly");

@@ -1077,7 +1081,7 @@ impl PageServerHandler {
        let replaced = self.timeline_handles.replace(timeline_handles);
        assert!(replaced.is_none());

-        Ok(())
+        result
    }

    #[allow(clippy::too_many_arguments)]
@@ -1090,33 +1094,50 @@ impl PageServerHandler {
        mut timeline_handles: TimelineHandles,
        request_span: Span,
        ctx: &RequestContext,
-    ) -> Result<(PostgresBackendReader<IO>, TimelineHandles), QueryError>
+    ) -> (
+        (PostgresBackendReader<IO>, TimelineHandles),
+        Result<(), QueryError>,
+    )
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
    {
-        loop {
+        let cancel = self.cancel.clone();
+        let err = loop {
            let msg = Self::pagestream_read_message(
                &mut pgb_reader,
                tenant_id,
                timeline_id,
                &mut timeline_handles,
-                &self.cancel,
+                &cancel,
                ctx,
                request_span.clone(),
            )
-            .await?;
+            .await;
+            let msg = match msg {
+                Ok(msg) => msg,
+                Err(e) => break e,
+            };
            let msg = match msg {
                Some(msg) => msg,
                None => {
                    debug!("pagestream subprotocol end observed");
-                    return Ok((pgb_reader, timeline_handles));
+                    return ((pgb_reader, timeline_handles), Ok(()));
                }
            };
-            self.pagesteam_handle_batched_message(pgb_writer, *msg, ctx)
-                .await?;
-        }
+            let err = self
+                .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
+                .await;
+            match err {
+                Ok(()) => {}
+                Err(e) => break e,
+            }
+        };
+        ((pgb_reader, timeline_handles), Err(err))
    }

+    /// # Cancel-Safety
+    ///
+    /// May leak tokio tasks if not polled to completion.
    #[allow(clippy::too_many_arguments)]
    async fn handle_pagerequests_pipelined<IO>(
        &mut self,
@@ -1126,185 +1147,175 @@ impl PageServerHandler {
        timeline_id: TimelineId,
        mut timeline_handles: TimelineHandles,
        request_span: Span,
-        pipelining_config: PageServicePipeliningConfig,
+        pipelining_config: PageServicePipeliningConfigPipelined,
        ctx: &RequestContext,
-    ) -> Result<(PostgresBackendReader<IO>, TimelineHandles), QueryError>
+    ) -> (
+        (PostgresBackendReader<IO>, TimelineHandles),
+        Result<(), QueryError>,
+    )
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
    {
-        let PageServicePipeliningConfig {
+        //
+        // Pipelined pagestream handling consists of
+        // - a Batcher that reads requests off the wire and
+        //   batches them if possible,
+        // - an Executor that processes the batched requests.
+        //
+        // The batch is built up inside an `spsc_fold` channel,
+        // shared between Batcher (Sender) and Executor (Receiver).
+        //
+        // The Batcher continuously folds client requests into the batch,
+        // while the Executor can at any time take out what's in the batch
+        // in order to process it.
+        // This means the next batch builds up while the Executor
+        // executes the last batch.
+        //
+        // CANCELLATION
+        //
+        // We run both Batcher and Executor futures to completion before
+        // returning from this function.
+        //
+        // If Executor exits first, it signals cancellation to the Batcher
+        // via a CancellationToken that is a child of `self.cancel`.
+        // If Batcher exits first, it signals cancellation to the Executor
+        // by dropping the spsc_fold channel Sender.
+        //
+        // CLEAN SHUTDOWN
+        //
+        // Clean shutdown means that the client ends the COPYBOTH session.
+        // In response to such a client message, the Batcher exits.
+        // The Executor continues to run, draining the spsc_fold channel.
+        // Once drained, the spsc_fold recv will fail with a distinct error
+        // indicating that the sender disconnected.
+        // The Executor exits with Ok(()) in response to that error.
+        //
+        // Server-initiated shutdown is not a clean shutdown; instead, it
+        // is an error Err(QueryError::Shutdown) that reaches the caller via
+        // the error propagation mechanism described below.
+        //
+        // ERROR PROPAGATION
+        //
+        // When the Batcher encounters an error, it sends it as a value
+        // through the spsc_fold channel and exits afterwards.
+        // When the Executor observes such an error in the channel,
+        // it exits returning that error value.
+        //
+        // This design ensures that the Executor stage will still process
+        // the batch that was in flight when the Batcher encountered an error,
+        // thereby behaving identically to a serial implementation.
+
+        let PageServicePipeliningConfigPipelined {
            max_batch_size,
-            protocol_pipelining_mode,
+            execution,
        } = pipelining_config;

-        let (requests_tx, mut requests_rx) = tokio::sync::mpsc::channel(1);
-        let read_messages = {
-            let cancel = self.cancel.child_token();
+        // Macro to _define_ a pipeline stage.
+        macro_rules! pipeline_stage {
+            ($name:literal, $cancel:expr, $make_fut:expr) => {{
+                let cancel: CancellationToken = $cancel;
+                let stage_fut = $make_fut(cancel.clone());
+                async move {
+                    scopeguard::defer! {
+                        debug!("exiting");
+                    }
+                    timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel)
+                        .await
+                }
+                .instrument(tracing::info_span!($name))
+            }};
+        }
+
+        //
+        // Batcher
+        //
+
+        let cancel_batcher = self.cancel.child_token();
+        let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
+        let read_messages = pipeline_stage!(
+            "read_messages",
+            cancel_batcher.clone(),
+            move |cancel_batcher| {
+                let ctx = ctx.attached_child();
+                async move {
+                    let mut pgb_reader = pgb_reader;
+                    let mut exit = false;
+                    while !exit {
+                        let read_res = Self::pagestream_read_message(
+                            &mut pgb_reader,
+                            tenant_id,
+                            timeline_id,
+                            &mut timeline_handles,
+                            &cancel_batcher,
+                            &ctx,
+                            request_span.clone(),
+                        )
+                        .await;
+                        let Some(read_res) = read_res.transpose() else {
+                            debug!("client-initiated shutdown");
+                            break;
+                        };
+                        exit |= read_res.is_err();
+                        let could_send = batch_tx
+                            .send(read_res, |batch, res| {
+                                Self::pagestream_do_batch(max_batch_size, batch, res)
+                            })
+                            .await;
+                        exit |= could_send.is_err();
+                    }
+                    (pgb_reader, timeline_handles)
+                }
+            }
+        );
+
+        //
+        // Executor
+        //
+
+        let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| {
            let ctx = ctx.attached_child();
            async move {
-                scopeguard::defer! {
-                    debug!("exiting");
-                }
-                let mut pgb_reader = pgb_reader;
+                let _cancel_batcher = cancel_batcher.drop_guard();
                loop {
-                    let msg = Self::pagestream_read_message(
-                        &mut pgb_reader,
-                        tenant_id,
-                        timeline_id,
-                        &mut timeline_handles,
-                        &cancel,
-                        &ctx,
-                        request_span.clone(),
-                    )
-                    .await?;
-                    let msg = match msg {
-                        Some(msg) => msg,
-                        None => {
-                            debug!("pagestream subprotocol end observed");
-                            break;
+                    let maybe_batch = batch_rx.recv().await;
+                    let batch = match maybe_batch {
+                        Ok(batch) => batch,
+                        Err(spsc_fold::RecvError::SenderGone) => {
+                            debug!("upstream gone");
+                            return Ok(());
                        }
                    };
-                    match requests_tx.send(msg).await {
-                        Ok(()) => {}
-                        Err(tokio::sync::mpsc::error::SendError(_)) => {
-                            debug!("downstream is gone");
-                            break;
+                    let batch = match batch {
+                        Ok(batch) => batch,
+                        Err(e) => {
+                            return Err(e);
                        }
-                    }
-                }
-                Ok((pgb_reader, timeline_handles))
-            }
-        }
-        .instrument(tracing::info_span!("read_messages"));
-
-        enum BatchState {
-            Building(Option<Box<BatchedFeMessage>>),
-            UpstreamDead(Option<Box<BatchedFeMessage>>),
-        }
-        impl BatchState {
-            fn must_building_mut(&mut self) -> &mut Option<Box<BatchedFeMessage>> {
-                match self {
-                    Self::Building(maybe_batch) => maybe_batch,
-                    Self::UpstreamDead(_) => panic!("upstream dead"),
-                }
-            }
-        }
-        let (batch_tx, mut batch_rx) = tokio::sync::watch::channel(Arc::new(
-            std::sync::Mutex::new(BatchState::Building(None)),
-        ));
-        let notify_batcher = Arc::new(tokio::sync::Notify::new());
-        let batcher = {
-            let notify_batcher = notify_batcher.clone();
-            async move {
-                scopeguard::defer! {
-                    debug!("exiting");
-                }
-                loop {
-                    let maybe_req = requests_rx.recv().await;
-                    let Some(req) = maybe_req else {
-                        batch_tx.send_modify(|pending_batch| {
-                            let mut guard = pending_batch.lock().unwrap();
-                            match &mut *guard {
-                                BatchState::Building(batch) => {
-                                    *guard = BatchState::UpstreamDead(batch.take());
-                                }
-                                BatchState::UpstreamDead(_) => panic!("twice"),
-                            }
-                        });
-                        break;
                    };
-                    // don't read new requests before this one has been processed
-                    let mut req = Some(req);
-                    loop {
-                        let mut wait_notified = None;
-                        let batched = batch_tx.send_if_modified(|pending_batch| {
-                            let mut guard = pending_batch.lock().unwrap();
-                            let building = guard.must_building_mut();
-                            match Self::pagestream_do_batch(
-                                max_batch_size,
-                                building,
-                                req.take().unwrap(),
-                            ) {
-                                Some(req_was_not_batched) => {
-                                    req.replace(req_was_not_batched);
-                                    wait_notified = Some(notify_batcher.notified());
-                                    false
-                                }
-                                None => true,
-                            }
-                        });
-                        if batched {
-                            break;
-                        } else {
-                            wait_notified.unwrap().await;
-                        }
-                    }
+                    self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
+                        .await?;
                }
            }
-        }
-        .instrument(tracing::info_span!("batcher"));
+        });

-        let executor = async {
-            let mut stop = false;
-            while !stop {
-                match batch_rx.changed().await {
-                    Ok(()) => {}
-                    Err(_) => {
-                        debug!("batch_rx observed disconnection of batcher");
-                    }
-                };
-                let maybe_batch = {
-                    let borrow = batch_rx.borrow();
-                    let mut guard = borrow.lock().unwrap();
-                    match &mut *guard {
-                        BatchState::Building(maybe_batch) => maybe_batch.take(),
-                        BatchState::UpstreamDead(maybe_batch) => {
-                            debug!("upstream dead");
-                            stop = true;
-                            maybe_batch.take()
-                        }
-                    }
-                };
-                let Some(batch) = maybe_batch else {
-                    break;
-                };
-                notify_batcher.notify_one();
-                debug!("processing batch");
-                self.pagesteam_handle_batched_message(pgb_writer, *batch, ctx)
-                    .await?;
-            }
-            Ok(())
-        };
+        //
+        // Execute the stages.
+        //

-        let read_messages_res;
-        let executor_res;
-        match protocol_pipelining_mode {
-            PageServiceProtocolPipeliningMode::ConcurrentFutures => {
-                (read_messages_res, _, executor_res) =
-                    tokio::join!(read_messages, batcher, executor);
+        match execution {
+            PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
+                tokio::join!(read_messages, executor)
            }
-            PageServiceProtocolPipeliningMode::Tasks => {
-                // cancelled via sensitivity to self.cancel
-                let read_messages_task = tokio::task::spawn(read_messages);
-                // cancelled when it observes read_messages_task disconnect the channel
-                let batcher_task = tokio::task::spawn(batcher);
-                executor_res = executor.await;
-                read_messages_res = read_messages_task
-                    .await
-                    .context("read_messages task panicked, check logs for details")?;
-                let _: () = batcher_task
-                    .await
-                    .context("batcher task panicked, check logs for details")?;
+            PageServiceProtocolPipelinedExecutionStrategy::Tasks => {
+                // These tasks are not tracked anywhere.
+                let read_messages_task = tokio::spawn(read_messages);
+                let (read_messages_task_res, executor_res_) =
+                    tokio::join!(read_messages_task, executor,);
+                (
+                    read_messages_task_res.expect("propagated panic from read_messages"),
+                    executor_res_,
+                )
            }
        }
-
-        match (read_messages_res, executor_res) {
-            (Err(e), _) | (_, Err(e)) => {
-                let e: QueryError = e;
-                Err(e)
-            }
-            (Ok((pgb_reader, timeline_handles)), Ok(())) => Ok((pgb_reader, timeline_handles)),
-        }
    }

    /// Helper function to handle the LSN from client request.
@@ -1349,21 +1360,26 @@ impl PageServerHandler {
            ));
        }

-        if request_lsn < **latest_gc_cutoff_lsn {
+        // Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
+        if request_lsn == Lsn::INVALID {
+            return Err(PageStreamError::BadRequest(
+                "invalid LSN(0) in request".into(),
+            ));
+        }
+
+        // Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
+        //
+        // We may have older data available, but we make a best effort to detect this case and return an error,
+        // to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
+        if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
            let gc_info = &timeline.gc_info.read().unwrap();
            if !gc_info.leases.contains_key(&request_lsn) {
-                // The requested LSN is below gc cutoff and is not guarded by a lease.
-
-                // Check explicitly for INVALID just to get a less scary error message if the
-                // request is obviously bogus
-                return Err(if request_lsn == Lsn::INVALID {
-                    PageStreamError::BadRequest("invalid LSN(0) in request".into())
-                } else {
+                return Err(
                    PageStreamError::BadRequest(format!(
                        "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
                        request_lsn, **latest_gc_cutoff_lsn
                    ).into())
-                });
+                );
            }
        }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -10,6 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
+use crate::metrics::{
+    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
+};
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -389,7 +392,9 @@ impl Timeline {
        result
    }

-    // Get size of a database in blocks
+    /// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
+    /// other shards, by only accounting for relations the shard has pages for, and only accounting
+    /// for pages up to the highest page number it has stored.
    pub(crate) async fn get_db_size(
        &self,
        spcnode: Oid,
@@ -408,7 +413,10 @@ impl Timeline {
        Ok(total_blocks)
    }

-    /// Get size of a relation file
+    /// Get size of a relation file. The relation must exist, otherwise an error is returned.
+    ///
+    /// This is only accurate on shard 0. On other shards, it will return the size up to the highest
+    /// page number stored in the shard.
    pub(crate) async fn get_rel_size(
        &self,
        tag: RelTag,
@@ -444,7 +452,10 @@ impl Timeline {
        Ok(nblocks)
    }

-    /// Does relation exist?
+    /// Does the relation exist?
+    ///
+    /// Only shard 0 has a full view of the relations. Other shards only know about relations that
+    /// the shard stores pages for.
    pub(crate) async fn get_rel_exists(
        &self,
        tag: RelTag,
@@ -478,6 +489,9 @@ impl Timeline {

    /// Get a list of all existing relations in given tablespace and database.
    ///
+    /// Only shard 0 has a full view of the relations. Other shards only know about relations that
+    /// the shard stores pages for.
+    ///
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
@@ -1129,9 +1143,12 @@ impl Timeline {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
+                RELSIZE_CACHE_HITS.inc();
                return Some(*nblocks);
            }
+            RELSIZE_CACHE_MISSES_OLD.inc();
        }
+        RELSIZE_CACHE_MISSES.inc();
        None
    }

@@ -1156,6 +1173,7 @@ impl Timeline {
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((lsn, nblocks));
+                RELSIZE_CACHE_ENTRIES.inc();
            }
        }
    }
@@ -1163,13 +1181,17 @@ impl Timeline {
    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.map.insert(tag, (lsn, nblocks));
+        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_CACHE_ENTRIES.inc();
+        }
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.map.remove(tag);
+        if rel_size_cache.map.remove(tag).is_some() {
+            RELSIZE_CACHE_ENTRIES.dec();
+        }
    }
 }

@@ -1229,10 +1251,9 @@ impl<'a> DatadirModification<'a> {
    }

    pub(crate) fn has_dirty_data(&self) -> bool {
-        !self
-            .pending_data_batch
+        self.pending_data_batch
            .as_ref()
-            .map_or(true, |b| b.is_empty())
+            .map_or(false, |b| b.has_data())
    }

    /// Set the current lsn
@@ -1408,7 +1429,7 @@ impl<'a> DatadirModification<'a> {
            Some(pending_batch) => {
                pending_batch.extend(batch);
            }
-            None if !batch.is_empty() => {
+            None if batch.has_data() => {
                self.pending_data_batch = Some(batch);
            }
            None => {
@@ -2276,9 +2297,9 @@ impl<'a> Version<'a> {
 //--- Metadata structs stored in key-value pairs in the repository.

 #[derive(Debug, Serialize, Deserialize)]
-struct DbDirectory {
+pub(crate) struct DbDirectory {
    // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
-    dbdirs: HashMap<(Oid, Oid), bool>,
+    pub(crate) dbdirs: HashMap<(Oid, Oid), bool>,
 }

 // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
@@ -2287,8 +2308,8 @@ struct DbDirectory {
 // "pg_twophsae/0000000A000002E4".

 #[derive(Debug, Serialize, Deserialize)]
-struct TwoPhaseDirectory {
-    xids: HashSet<TransactionId>,
+pub(crate) struct TwoPhaseDirectory {
+    pub(crate) xids: HashSet<TransactionId>,
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -2297,12 +2318,12 @@ struct TwoPhaseDirectoryV17 {
 }

 #[derive(Debug, Serialize, Deserialize, Default)]
-struct RelDirectory {
+pub(crate) struct RelDirectory {
    // Set of relations that exist. (relfilenode, forknum)
    //
    // TODO: Store it as a btree or radix tree or something else that spans multiple
    // key-value pairs, if you have a lot of relations
-    rels: HashSet<(Oid, u8)>,
+    pub(crate) rels: HashSet<(Oid, u8)>,
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -2311,9 +2332,9 @@ struct RelSizeEntry {
 }

 #[derive(Debug, Serialize, Deserialize, Default)]
-struct SlruSegmentDirectory {
+pub(crate) struct SlruSegmentDirectory {
    // Set of SLRU segments that exist.
-    segments: HashSet<u32>,
+    pub(crate) segments: HashSet<u32>,
 }

 #[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -381,6 +381,8 @@ pub enum TaskKind {
    UnitTest,

    DetachAncestor,
+
+    ImportPgdata,
 }

 #[derive(Default)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -43,7 +43,9 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
+use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
@@ -373,7 +375,6 @@ pub struct Tenant {

    l0_flush_global_state: L0FlushGlobalState,
 }
-
 impl std::fmt::Debug for Tenant {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
@@ -860,6 +861,7 @@ impl Debug for SetStoppingError {
 pub(crate) enum CreateTimelineParams {
    Bootstrap(CreateTimelineParamsBootstrap),
    Branch(CreateTimelineParamsBranch),
+    ImportPgdata(CreateTimelineParamsImportPgdata),
 }

 #[derive(Debug)]
@@ -877,7 +879,14 @@ pub(crate) struct CreateTimelineParamsBranch {
    pub(crate) ancestor_start_lsn: Option<Lsn>,
 }

-/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in  [`Tenant::start_creating_timeline`].
+#[derive(Debug)]
+pub(crate) struct CreateTimelineParamsImportPgdata {
+    pub(crate) new_timeline_id: TimelineId,
+    pub(crate) location: import_pgdata::index_part_format::Location,
+    pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
+}
+
+/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
 ///
 /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
 ///
@@ -907,19 +916,50 @@ pub(crate) enum CreateTimelineIdempotency {
        ancestor_timeline_id: TimelineId,
        ancestor_start_lsn: Lsn,
    },
+    ImportPgdata(CreatingTimelineIdempotencyImportPgdata),
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
+    idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

 /// What is returned by [`Tenant::start_creating_timeline`].
 #[must_use]
-enum StartCreatingTimelineResult<'t> {
-    CreateGuard(TimelineCreateGuard<'t>),
+enum StartCreatingTimelineResult {
+    CreateGuard(TimelineCreateGuard),
    Idempotent(Arc<Timeline>),
 }

+enum TimelineInitAndSyncResult {
+    ReadyToActivate(Arc<Timeline>),
+    NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
+}
+
+impl TimelineInitAndSyncResult {
+    fn ready_to_activate(self) -> Option<Arc<Timeline>> {
+        match self {
+            Self::ReadyToActivate(timeline) => Some(timeline),
+            _ => None,
+        }
+    }
+}
+
+#[must_use]
+struct TimelineInitAndSyncNeedsSpawnImportPgdata {
+    timeline: Arc<Timeline>,
+    import_pgdata: import_pgdata::index_part_format::Root,
+    guard: TimelineCreateGuard,
+}
+
 /// What is returned by [`Tenant::create_timeline`].
 enum CreateTimelineResult {
    Created(Arc<Timeline>),
    Idempotent(Arc<Timeline>),
+    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
+    /// we return this result, nor will this concrete object ever be added there.
+    /// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
+    ImportSpawned(Arc<Timeline>),
 }

 impl CreateTimelineResult {
@@ -927,18 +967,19 @@ impl CreateTimelineResult {
        match self {
            Self::Created(_) => "Created",
            Self::Idempotent(_) => "Idempotent",
+            Self::ImportSpawned(_) => "ImportSpawned",
        }
    }
    fn timeline(&self) -> &Arc<Timeline> {
        match self {
-            Self::Created(t) | Self::Idempotent(t) => t,
+            Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
        }
    }
    /// Unit test timelines aren't activated, test has to do it if it needs to.
    #[cfg(test)]
    fn into_timeline_for_test(self) -> Arc<Timeline> {
        match self {
-            Self::Created(t) | Self::Idempotent(t) => t,
+            Self::Created(t) | Self::Idempotent(t) | Self::ImportSpawned(t) => t,
        }
    }
 }
@@ -962,33 +1003,13 @@ pub enum CreateTimelineError {
 }

 #[derive(thiserror::Error, Debug)]
-enum InitdbError {
-    Other(anyhow::Error),
+pub enum InitdbError {
+    #[error("Operation was cancelled")]
    Cancelled,
-    Spawn(std::io::Result<()>),
-    Failed(std::process::ExitStatus, Vec<u8>),
-}
-
-impl fmt::Display for InitdbError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self {
-            InitdbError::Cancelled => write!(f, "Operation was cancelled"),
-            InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e),
-            InitdbError::Failed(status, stderr) => write!(
-                f,
-                "Command failed with status {:?}: {}",
-                status,
-                String::from_utf8_lossy(stderr)
-            ),
-            InitdbError::Other(e) => write!(f, "Error: {:?}", e),
-        }
-    }
-}
-
-impl From<std::io::Error> for InitdbError {
-    fn from(error: std::io::Error) -> Self {
-        InitdbError::Spawn(Err(error))
-    }
+    #[error(transparent)]
+    Other(anyhow::Error),
+    #[error(transparent)]
+    Inner(postgres_initdb::Error),
 }

 enum CreateTimelineCause {
@@ -996,6 +1017,15 @@ enum CreateTimelineCause {
    Delete,
 }

+enum LoadTimelineCause {
+    Attach,
+    Unoffload,
+    ImportPgdata {
+        create_guard: TimelineCreateGuard,
+        activate: ActivateTimelineArgs,
+    },
+}
+
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum GcError {
    // The tenant is shutting down
@@ -1072,24 +1102,35 @@ impl Tenant {
    /// it is marked as Active.
    #[allow(clippy::too_many_arguments)]
    async fn timeline_init_and_sync(
-        &self,
+        self: &Arc<Self>,
        timeline_id: TimelineId,
        resources: TimelineResources,
-        index_part: IndexPart,
+        mut index_part: IndexPart,
        metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
-        _ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+        cause: LoadTimelineCause,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<TimelineInitAndSyncResult> {
        let tenant_id = self.tenant_shard_id;

-        let idempotency = if metadata.ancestor_timeline().is_none() {
-            CreateTimelineIdempotency::Bootstrap {
-                pg_version: metadata.pg_version(),
+        let import_pgdata = index_part.import_pgdata.take();
+        let idempotency = match &import_pgdata {
+            Some(import_pgdata) => {
+                CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
+                    idempotency_key: import_pgdata.idempotency_key().clone(),
+                })
            }
-        } else {
-            CreateTimelineIdempotency::Branch {
-                ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
-                ancestor_start_lsn: metadata.ancestor_lsn(),
+            None => {
+                if metadata.ancestor_timeline().is_none() {
+                    CreateTimelineIdempotency::Bootstrap {
+                        pg_version: metadata.pg_version(),
+                    }
+                } else {
+                    CreateTimelineIdempotency::Branch {
+                        ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
+                        ancestor_start_lsn: metadata.ancestor_lsn(),
+                    }
+                }
            }
        };

@@ -1121,39 +1162,91 @@ impl Tenant {
                format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
            })?;

-        {
-            // avoiding holding it across awaits
-            let mut timelines_accessor = self.timelines.lock().unwrap();
-            match timelines_accessor.entry(timeline_id) {
-                // We should never try and load the same timeline twice during startup
-                Entry::Occupied(_) => {
-                    unreachable!(
-                        "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
-                    );
+        match import_pgdata {
+            Some(import_pgdata) if !import_pgdata.is_done() => {
+                match cause {
+                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
+                    LoadTimelineCause::ImportPgdata { .. } => {
+                        unreachable!("ImportPgdata should not be reloading timeline import is done and persisted as such in s3")
+                    }
                }
-                Entry::Vacant(v) => {
-                    v.insert(Arc::clone(&timeline));
-                    timeline.maybe_spawn_flush_loop();
+                let mut guard = self.timelines_creating.lock().unwrap();
+                if !guard.insert(timeline_id) {
+                    // We should never try and load the same timeline twice during startup
+                    unreachable!("Timeline {tenant_id}/{timeline_id} is already being created")
                }
+                let timeline_create_guard = TimelineCreateGuard {
+                    _tenant_gate_guard: self.gate.enter()?,
+                    owning_tenant: self.clone(),
+                    timeline_id,
+                    idempotency,
+                    // The users of this specific return value don't need the timeline_path in there.
+                    timeline_path: timeline
+                        .conf
+                        .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id),
+                };
+                Ok(TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
+                    TimelineInitAndSyncNeedsSpawnImportPgdata {
+                        timeline,
+                        import_pgdata,
+                        guard: timeline_create_guard,
+                    },
+                ))
            }
-        };
+            Some(_) | None => {
+                {
+                    let mut timelines_accessor = self.timelines.lock().unwrap();
+                    match timelines_accessor.entry(timeline_id) {
+                        // We should never try and load the same timeline twice during startup
+                        Entry::Occupied(_) => {
+                            unreachable!(
+                            "Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
+                        );
+                        }
+                        Entry::Vacant(v) => {
+                            v.insert(Arc::clone(&timeline));
+                            timeline.maybe_spawn_flush_loop();
+                        }
+                    }
+                }

-        // Sanity check: a timeline should have some content.
-        anyhow::ensure!(
-            ancestor.is_some()
-                || timeline
-                    .layers
-                    .read()
-                    .await
-                    .layer_map()
-                    .expect("currently loading, layer manager cannot be shutdown already")
-                    .iter_historic_layers()
-                    .next()
-                    .is_some(),
-            "Timeline has no ancestor and no layer files"
-        );
+                // Sanity check: a timeline should have some content.
+                anyhow::ensure!(
+                    ancestor.is_some()
+                        || timeline
+                            .layers
+                            .read()
+                            .await
+                            .layer_map()
+                            .expect("currently loading, layer manager cannot be shutdown already")
+                            .iter_historic_layers()
+                            .next()
+                            .is_some(),
+                    "Timeline has no ancestor and no layer files"
+                );

-        Ok(())
+                match cause {
+                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
+                    LoadTimelineCause::ImportPgdata {
+                        create_guard,
+                        activate,
+                    } => {
+                        // TODO: see the comment in create_timeline_import_pgdata_task_impl on why
+                        // it is uncertain whether activating here is safe with concurrent shutdowns.
+                        match activate {
+                            ActivateTimelineArgs::Yes { broker_client } => {
+                                info!("activating timeline after reload from pgdata import task");
+                                timeline.activate(self.clone(), broker_client, None, ctx);
+                            }
+                            ActivateTimelineArgs::No => (),
+                        }
+                        drop(create_guard);
+                    }
+                }
+
+                Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline))
+            }
+        }
    }

    /// Attach a tenant that's available in cloud storage.
@@ -1578,24 +1671,46 @@ impl Tenant {
            }

            // TODO again handle early failure
-            self.load_remote_timeline(
-                timeline_id,
-                index_part,
-                remote_metadata,
-                TimelineResources {
-                    remote_client,
-                    timeline_get_throttle: self.timeline_get_throttle.clone(),
-                    l0_flush_global_state: self.l0_flush_global_state.clone(),
-                },
-                ctx,
-            )
-            .await
-            .with_context(|| {
-                format!(
-                    "failed to load remote timeline {} for tenant {}",
-                    timeline_id, self.tenant_shard_id
+            let effect = self
+                .load_remote_timeline(
+                    timeline_id,
+                    index_part,
+                    remote_metadata,
+                    TimelineResources {
+                        remote_client,
+                        timeline_get_throttle: self.timeline_get_throttle.clone(),
+                        l0_flush_global_state: self.l0_flush_global_state.clone(),
+                    },
+                    LoadTimelineCause::Attach,
+                    ctx,
                )
-            })?;
+                .await
+                .with_context(|| {
+                    format!(
+                        "failed to load remote timeline {} for tenant {}",
+                        timeline_id, self.tenant_shard_id
+                    )
+                })?;
+
+            match effect {
+                TimelineInitAndSyncResult::ReadyToActivate(_) => {
+                    // activation happens later, on Tenant::activate
+                }
+                TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
+                    TimelineInitAndSyncNeedsSpawnImportPgdata {
+                        timeline,
+                        import_pgdata,
+                        guard,
+                    },
+                ) => {
+                    tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
+                        timeline,
+                        import_pgdata,
+                        ActivateTimelineArgs::No,
+                        guard,
+                    ));
+                }
+            }
        }

        // Walk through deleted timelines, resume deletion
@@ -1719,13 +1834,14 @@ impl Tenant {

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
    async fn load_remote_timeline(
-        &self,
+        self: &Arc<Self>,
        timeline_id: TimelineId,
        index_part: IndexPart,
        remote_metadata: TimelineMetadata,
        resources: TimelineResources,
+        cause: LoadTimelineCause,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<TimelineInitAndSyncResult> {
        span::debug_assert_current_span_has_tenant_id();

        info!("downloading index file for timeline {}", timeline_id);
@@ -1752,6 +1868,7 @@ impl Tenant {
            index_part,
            remote_metadata,
            ancestor,
+            cause,
            ctx,
        )
        .await
@@ -1938,6 +2055,7 @@ impl Tenant {
                    TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
                }
                TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
+                TimelineExclusionError::ShuttingDown => TimelineArchivalError::Cancelled,
            })?;

        let timeline_preload = self
@@ -1976,6 +2094,7 @@ impl Tenant {
            index_part,
            remote_metadata,
            timeline_resources,
+            LoadTimelineCause::Unoffload,
            &ctx,
        )
        .await
@@ -2213,7 +2332,7 @@ impl Tenant {
    ///
    /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
    pub(crate) async fn create_empty_timeline(
-        &self,
+        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
        pg_version: u32,
@@ -2263,7 +2382,7 @@ impl Tenant {
    // Our current tests don't need the background loops.
    #[cfg(test)]
    pub async fn create_test_timeline(
-        &self,
+        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
        pg_version: u32,
@@ -2302,7 +2421,7 @@ impl Tenant {
    #[cfg(test)]
    #[allow(clippy::too_many_arguments)]
    pub async fn create_test_timeline_with_layers(
-        &self,
+        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
        pg_version: u32,
@@ -2439,6 +2558,16 @@ impl Tenant {
                self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
                    .await?
            }
+            CreateTimelineParams::ImportPgdata(params) => {
+                self.create_timeline_import_pgdata(
+                    params,
+                    ActivateTimelineArgs::Yes {
+                        broker_client: broker_client.clone(),
+                    },
+                    ctx,
+                )
+                .await?
+            }
        };

        // At this point we have dropped our guard on [`Self::timelines_creating`], and
@@ -2481,11 +2610,202 @@ impl Tenant {
                );
                timeline
            }
+            CreateTimelineResult::ImportSpawned(timeline) => {
+                info!("import task spawned, timeline will become visible and activated once the import is done");
+                timeline
+            }
        };

        Ok(activated_timeline)
    }

+    /// Start a pgdata-import timeline creation: schedule an index upload carrying the `InProgress`
+    /// import state and spawn the background import task (or return early if the request is
+    /// idempotent). The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map; a
+    /// DIFFERENT [`Arc<Timeline>`] is inserted there when the import completes. We only return an
+    /// [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`].
+    async fn create_timeline_import_pgdata(
+        self: &Arc<Tenant>,
+        params: CreateTimelineParamsImportPgdata,
+        activate: ActivateTimelineArgs,
+        ctx: &RequestContext,
+    ) -> Result<CreateTimelineResult, CreateTimelineError> {
+        let CreateTimelineParamsImportPgdata {
+            new_timeline_id,
+            location,
+            idempotency_key,
+        } = params;
+
+        let started_at = chrono::Utc::now().naive_utc();
+
+        // There's probably a simpler way to upload an index part, but, remote_timeline_client
+        // is the canonical way we do it. The steps below are:
+        // - take the creation guard for `new_timeline_id` (or return early if idempotent)
+        // - create an empty timeline in-memory
+        // - use its remote_timeline_client to do the upload
+        // - dispose of the uninit timeline
+        // - keep the creation guard alive by handing it to the spawned import task
+
+        let timeline_create_guard = match self
+            .start_creating_timeline(
+                new_timeline_id,
+                CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
+                    idempotency_key: idempotency_key.clone(),
+                }),
+            )
+            .await?
+        {
+            StartCreatingTimelineResult::CreateGuard(guard) => guard,
+            StartCreatingTimelineResult::Idempotent(timeline) => {
+                return Ok(CreateTimelineResult::Idempotent(timeline))
+            }
+        };
+
+        let mut uninit_timeline = {
+            let this = &self;
+            let initdb_lsn = Lsn(0);
+            let _ctx = ctx; // `ctx` is not used anywhere else in this function
+            async move {
+                let new_metadata = TimelineMetadata::new(
+                    // Initialize disk_consistent LSN to 0. The import task is expected to load data
+                    // that makes it valid; this path ends with finish_creation_myself(), not finish_creation().
+                    Lsn(0),
+                    None,
+                    None,
+                    Lsn(0),
+                    initdb_lsn,
+                    initdb_lsn,
+                    15, // NOTE(review): hard-coded; presumably the pg_version argument — confirm
+                );
+                this.prepare_new_timeline(
+                    new_timeline_id,
+                    &new_metadata,
+                    timeline_create_guard,
+                    initdb_lsn,
+                    None,
+                )
+                .await
+            }
+        }
+        .await?;
+
+        let in_progress = import_pgdata::index_part_format::InProgress {
+            idempotency_key,
+            location,
+            started_at,
+        };
+        let index_part = import_pgdata::index_part_format::Root::V1(
+            import_pgdata::index_part_format::V1::InProgress(in_progress),
+        );
+        uninit_timeline
+            .raw_timeline()
+            .unwrap() // NOTE(review): panics if absent — presumably always set right after prepare_new_timeline; confirm
+            .remote_client
+            .schedule_index_upload_for_import_pgdata_state_update(Some(index_part.clone()))?;
+
+        // wait_completion happens in caller
+
+        let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();
+
+        tokio::spawn(self.clone().create_timeline_import_pgdata_task(
+            timeline.clone(),
+            index_part,
+            activate,
+            timeline_create_guard,
+        ));
+
+        // NB: the timeline doesn't exist in self.timelines at this point
+        Ok(CreateTimelineResult::ImportSpawned(timeline))
+    }
+
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
+    async fn create_timeline_import_pgdata_task(
+        self: Arc<Tenant>,
+        timeline: Arc<Timeline>,
+        index_part: import_pgdata::index_part_format::Root,
+        activate: ActivateTimelineArgs,
+        timeline_create_guard: TimelineCreateGuard,
+    ) {
+        debug_assert_current_span_has_tenant_and_timeline_id(); // span fields are set by #[instrument] above
+        info!("starting");
+        scopeguard::defer! {info!("exiting")}; // deferred: logged when this scope is left, on any path
+
+        let res = self
+            .create_timeline_import_pgdata_task_impl(
+                timeline,
+                index_part,
+                activate,
+                timeline_create_guard,
+            )
+            .await;
+        if let Err(err) = &res { // failures are only logged; this task returns `()` either way
+            error!(?err, "task failed");
+            // TODO sleep & retry, sensitive to tenant shutdown
+            // TODO: allow timeline deletion requests => should cancel the task
+        }
+    }
+
+    async fn create_timeline_import_pgdata_task_impl(
+        self: Arc<Tenant>,
+        timeline: Arc<Timeline>,
+        index_part: import_pgdata::index_part_format::Root,
+        activate: ActivateTimelineArgs,
+        timeline_create_guard: TimelineCreateGuard,
+    ) -> Result<(), anyhow::Error> {
+        let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
+
+        info!("importing pgdata");
+        import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
+            .await
+            .context("import")?;
+        info!("import done");
+
+        // After a successful import, shut down the in-memory timeline that was used for importing
+        // and reload the timeline from remote, the same way the attach and unoffload paths do.
+        // This proves that the remote state is attachable, and it reuses the code.
+        //
+        // TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
+        // timeline_create_guard holds the tenant gate open, so, shutdown cannot _complete_ until we exit.
+        // But our activate() call might launch new background tasks after Tenant::shutdown
+        // already went past shutting down the Tenant::timelines, which this timeline here is not part of.
+        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
+        // down while bootstrapping/branching + activating), but, the race condition is much more likely
+        // to manifest because of the long runtime of this import task.
+
+        // In theory this shouldn't even .await anything except for coop yield.
+        info!("shutting down timeline");
+        timeline.shutdown(ShutdownMode::Hard).await;
+        info!("timeline shut down, reloading from remote");
+        // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc<Timeline>
+        // let Some(timeline) = Arc::into_inner(timeline) else {
+        //     anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere");
+        // };
+        let timeline_id = timeline.timeline_id;
+
+        // load from object storage like Tenant::attach does
+        let resources = self.build_timeline_resources(timeline_id);
+        let index_part = resources
+            .remote_client
+            .download_index_file(&self.cancel)
+            .await?;
+        let index_part = match index_part {
+            MaybeDeletedIndexPart::Deleted(_) => {
+                // likely concurrent delete call, cplane should prevent this
+                anyhow::bail!("index part says deleted but we are not done creating yet, this should not happen but")
+            }
+            MaybeDeletedIndexPart::IndexPart(p) => p,
+        };
+        let metadata = index_part.metadata.clone();
+        self
+            .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
+                create_guard: timeline_create_guard, activate, }, &ctx)
+            .await?
+            .ready_to_activate()
+            .context("implementation error: reloaded timeline still needs import after import reported success")?;
+
+        anyhow::Ok(())
+    }
+
    pub(crate) async fn delete_timeline(
        self: Arc<Self>,
        timeline_id: TimelineId,
@@ -2895,6 +3215,18 @@ impl Tenant {
            }
        }

+        if let ShutdownMode::Reload = shutdown_mode {
+            tracing::info!("Flushing deletion queue");
+            if let Err(e) = self.deletion_queue_client.flush().await {
+                match e {
+                    DeletionQueueError::ShuttingDown => {
+                        // This is the only error we expect for now. In the future, if more error
+                        // variants are added, we should handle them here.
+                    }
+                }
+            }
+        }
+
        // We cancel the Tenant's cancellation token _after_ the timelines have all shut down.  This permits
        // them to continue to do work during their shutdown methods, e.g. flushing data.
        tracing::debug!("Cancelling CancellationToken");
@@ -3337,6 +3669,13 @@ where
    Ok(result)
 }

+enum ActivateTimelineArgs { // tells the pgdata-import path whether to activate the reloaded timeline
+    Yes {
+        broker_client: storage_broker::BrokerClientChannel, // handed to `timeline.activate(..)`
+    },
+    No, // the import path does not activate the timeline itself
+}
+
 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.load().tenant_conf.clone()
@@ -3520,6 +3859,7 @@ impl Tenant {
    /// `validate_ancestor == false` is used when a timeline is created for deletion
    /// and we might not have the ancestor present anymore which is fine for to be
    /// deleted timelines.
+    #[allow(clippy::too_many_arguments)]
    fn create_timeline_struct(
        &self,
        new_timeline_id: TimelineId,
@@ -4283,16 +4623,17 @@ impl Tenant {
    /// If the timeline was already created in the meantime, we check whether this
    /// request conflicts or is idempotent , based on `state`.
    async fn start_creating_timeline(
-        &self,
+        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        idempotency: CreateTimelineIdempotency,
-    ) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
+    ) -> Result<StartCreatingTimelineResult, CreateTimelineError> {
        let allow_offloaded = false;
        match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
            Ok(create_guard) => {
                pausable_failpoint!("timeline-creation-after-uninit");
                Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
            }
+            Err(TimelineExclusionError::ShuttingDown) => Err(CreateTimelineError::ShuttingDown),
            Err(TimelineExclusionError::AlreadyCreating) => {
                // Creation is in progress, we cannot create it again, and we cannot
                // check if this request matches the existing one, so caller must try
@@ -4582,7 +4923,7 @@ impl Tenant {
        &'a self,
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
-        create_guard: TimelineCreateGuard<'a>,
+        create_guard: TimelineCreateGuard,
        start_lsn: Lsn,
        ancestor: Option<Arc<Timeline>>,
    ) -> anyhow::Result<UninitializedTimeline<'a>> {
@@ -4642,7 +4983,7 @@ impl Tenant {
    /// The `allow_offloaded` parameter controls whether to tolerate the existence of
    /// offloaded timelines or not.
    fn create_timeline_create_guard(
-        &self,
+        self: &Arc<Self>,
        timeline_id: TimelineId,
        idempotency: CreateTimelineIdempotency,
        allow_offloaded: bool,
@@ -4902,48 +5243,16 @@ async fn run_initdb(

    let _permit = INIT_DB_SEMAPHORE.acquire().await;

-    let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
-    initdb_command
-        .args(["--pgdata", initdb_target_dir.as_ref()])
-        .args(["--username", &conf.superuser])
-        .args(["--encoding", "utf8"])
-        .args(["--locale", &conf.locale])
-        .arg("--no-instructions")
-        .arg("--no-sync")
-        .env_clear()
-        .env("LD_LIBRARY_PATH", &initdb_lib_dir)
-        .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
-        .stdin(std::process::Stdio::null())
-        // stdout invocation produces the same output every time, we don't need it
-        .stdout(std::process::Stdio::null())
-        // we would be interested in the stderr output, if there was any
-        .stderr(std::process::Stdio::piped());
-
-    // Before version 14, only the libc provide was available.
-    if pg_version > 14 {
-        // Version 17 brought with it a builtin locale provider which only provides
-        // C and C.UTF-8. While being safer for collation purposes since it is
-        // guaranteed to be consistent throughout a major release, it is also more
-        // performant.
-        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
-
-        initdb_command.args(["--locale-provider", locale_provider]);
-    }
-
-    let initdb_proc = initdb_command.spawn()?;
-
-    // Ideally we'd select here with the cancellation token, but the problem is that
-    // we can't safely terminate initdb: it launches processes of its own, and killing
-    // initdb doesn't kill them. After we return from this function, we want the target
-    // directory to be able to be cleaned up.
-    // See https://github.com/neondatabase/neon/issues/6385
-    let initdb_output = initdb_proc.wait_with_output().await?;
-    if !initdb_output.status.success() {
-        return Err(InitdbError::Failed(
-            initdb_output.status,
-            initdb_output.stderr,
-        ));
-    }
+    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
+        superuser: &conf.superuser,
+        locale: &conf.locale,
+        initdb_bin: &initdb_bin_path,
+        pg_version,
+        library_search_path: &initdb_lib_dir,
+        pgdata: initdb_target_dir,
+    })
+    .await
+    .map_err(InitdbError::Inner);

    // This isn't true cancellation support, see above. Still return an error to
    // excercise the cancellation code path.
@@ -4951,7 +5260,7 @@ async fn run_initdb(
        return Err(InitdbError::Cancelled);
    }

-    Ok(())
+    res
 }

 /// Dump contents of a layer file to stdout.
@@ -5047,6 +5356,7 @@ pub(crate) mod harness {
                lsn_lease_length: Some(tenant_conf.lsn_lease_length),
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                timeline_offloading: Some(tenant_conf.timeline_offloading),
+                wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
            }
        }
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -19,6 +19,7 @@ use serde_json::Value;
 use std::num::NonZeroU64;
 use std::time::Duration;
 use utils::generation::Generation;
+use utils::postgres_client::PostgresClientProtocol;

 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
@@ -353,6 +354,9 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub timeline_offloading: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
 }

 impl TenantConfOpt {
@@ -418,6 +422,9 @@ impl TenantConfOpt {
            timeline_offloading: self
                .lazy_slru_download
                .unwrap_or(global_conf.timeline_offloading),
+            wal_receiver_protocol_override: self
+                .wal_receiver_protocol_override
+                .or(global_conf.wal_receiver_protocol_override),
        }
    }
 }
@@ -472,6 +479,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
            timeline_offloading: value.timeline_offloading,
+            wal_receiver_protocol_override: value.wal_receiver_protocol_override,
        }
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1960,7 +1960,7 @@ impl TenantManager {
            attempt.before_reset_tenant();

            let (_guard, progress) = utils::completion::channel();
-            match tenant.shutdown(progress, ShutdownMode::Flush).await {
+            match tenant.shutdown(progress, ShutdownMode::Reload).await {
                Ok(()) => {
                    slot_guard.drop_old_value().expect("it was just shutdown");
                }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -199,7 +199,7 @@ use utils::backoff::{
 use utils::pausable_failpoint;
 use utils::shard::ShardNumber;

-use std::collections::{HashMap, VecDeque};
+use std::collections::{HashMap, HashSet, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex, OnceLock};
 use std::time::Duration;
@@ -223,7 +223,7 @@ use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
-use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable};
+use crate::tenant::upload_queue::{Delete, OpType, UploadQueueStoppedDeletable};
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
    config::PageServerConf,
@@ -244,6 +244,7 @@ use self::index::IndexPart;
 use super::config::AttachedLocationConfig;
 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
+use super::timeline::import_pgdata;
 use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
 use super::{DeleteTimelineError, Generation};

@@ -813,6 +814,18 @@ impl RemoteTimelineClient {
        Ok(need_wait)
    }

+    /// Store `state` in `upload_queue.dirty.import_pgdata`, then launch an index-file upload operation in the background.
+    pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
+        self: &Arc<Self>,
+        state: Option<import_pgdata::index_part_format::Root>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?; // presumably fails unless the queue is initialized — confirm
+        upload_queue.dirty.import_pgdata = state; // `None` clears the field
+        self.schedule_index_upload(upload_queue)?;
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -1090,7 +1103,7 @@ impl RemoteTimelineClient {
            "scheduled layer file upload {layer}",
        );

-        let op = UploadOp::UploadLayer(layer, metadata);
+        let op = UploadOp::UploadLayer(layer, metadata, None);
        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }
@@ -1805,7 +1818,7 @@ impl RemoteTimelineClient {
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
                }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(..) => {
                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }
@@ -1833,19 +1846,32 @@ impl RemoteTimelineClient {
            }

            // We can launch this task. Remove it from the queue first.
-            let next_op = upload_queue.queued_operations.pop_front().unwrap();
+            let mut next_op = upload_queue.queued_operations.pop_front().unwrap();

            debug!("starting op: {}", next_op);

-            // Update the counters
-            match next_op {
-                UploadOp::UploadLayer(_, _) => {
+            // Update the counters and prepare
+            match &mut next_op {
+                UploadOp::UploadLayer(layer, meta, mode) => {
+                    if upload_queue
+                        .recently_deleted
+                        .remove(&(layer.layer_desc().layer_name().clone(), meta.generation))
+                    {
+                        *mode = Some(OpType::FlushDeletion);
+                    } else {
+                        *mode = Some(OpType::MayReorder)
+                    }
                    upload_queue.num_inprogress_layer_uploads += 1;
                }
                UploadOp::UploadMetadata { .. } => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::Delete(_) => {
+                UploadOp::Delete(Delete { layers }) => {
+                    for (name, meta) in layers {
+                        upload_queue
+                            .recently_deleted
+                            .insert((name.clone(), meta.generation));
+                    }
                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
@@ -1921,7 +1947,66 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
+                    if let Some(OpType::FlushDeletion) = mode {
+                        if self.config.read().unwrap().block_deletions {
+                            // Of course, this is not efficient... but usually the queue should be empty.
+                            let mut queue_locked = self.upload_queue.lock().unwrap();
+                            let mut detected = false;
+                            if let Ok(queue) = queue_locked.initialized_mut() {
+                                for list in queue.blocked_deletions.iter_mut() {
+                                    list.layers.retain(|(name, meta)| {
+                                        if name == &layer.layer_desc().layer_name()
+                                            && meta.generation == layer_metadata.generation
+                                        {
+                                            detected = true;
+                                            // remove the layer from deletion queue
+                                            false
+                                        } else {
+                                            // keep the layer
+                                            true
+                                        }
+                                    });
+                                }
+                            }
+                            if detected {
+                                info!(
+                                    "cancelled blocked deletion of layer {} at gen {:?}",
+                                    layer.layer_desc().layer_name(),
+                                    layer_metadata.generation
+                                );
+                            }
+                        } else {
+                            // TODO: we do not guarantee that the upload task starts after the deletion task, so there is a possible race condition
+                            // in which the layer still gets deleted. But this only happens if someone creates a layer immediately after it's deleted,
+                            // which is not possible in the current system.
+                            info!(
+                                "waiting for deletion queue flush to complete before uploading layer {} at gen {:?}",
+                                layer.layer_desc().layer_name(),
+                                layer_metadata.generation
+                            );
+                            {
+                                // We are going to flush, we can clean up the recently deleted list.
+                                let mut queue_locked = self.upload_queue.lock().unwrap();
+                                if let Ok(queue) = queue_locked.initialized_mut() {
+                                    queue.recently_deleted.clear();
+                                }
+                            }
+                            if let Err(e) = self.deletion_queue_client.flush_execute().await {
+                                warn!(
+                                    "failed to flush the deletion queue before uploading layer {} at gen {:?}, still proceeding to upload: {e:#} ",
+                                    layer.layer_desc().layer_name(),
+                                    layer_metadata.generation
+                                );
+                            } else {
+                                info!(
+                                    "done flushing deletion queue before uploading layer {} at gen {:?}",
+                                    layer.layer_desc().layer_name(),
+                                    layer_metadata.generation
+                                );
+                            }
+                        }
+                    }
                    let local_path = layer.local_path();

                    // We should only be uploading layers created by this `Tenant`'s lifetime, so
@@ -2085,7 +2170,7 @@ impl RemoteTimelineClient {
            upload_queue.inprogress_tasks.remove(&task.task_id);

            let lsn_update = match task.op {
-                UploadOp::UploadLayer(_, _) => {
+                UploadOp::UploadLayer(_, _, _) => {
                    upload_queue.num_inprogress_layer_uploads -= 1;
                    None
                }
@@ -2162,7 +2247,7 @@ impl RemoteTimelineClient {
    )> {
        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
        let res = match op {
-            UploadOp::UploadLayer(_, m) => (
+            UploadOp::UploadLayer(_, m, _) => (
                RemoteOpFileKind::Layer,
                RemoteOpKind::Upload,
                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
@@ -2259,6 +2344,7 @@ impl RemoteTimelineClient {
                        blocked_deletions: Vec::new(),
                        shutting_down: false,
                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        recently_deleted: HashSet::new(),
                    };

                    let upload_queue = std::mem::replace(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -706,7 +706,7 @@ where
    .and_then(|x| x)
 }

-async fn download_retry_forever<T, O, F>(
+pub(crate) async fn download_retry_forever<T, O, F>(
    op: O,
    description: &str,
    cancel: &CancellationToken,
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -12,6 +12,7 @@ use utils::id::TimelineId;

 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::timeline::import_pgdata;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;

@@ -37,6 +38,13 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub archived_at: Option<NaiveDateTime>,

+    /// This field supports import-from-pgdata ("fast imports" platform feature).
+    /// We don't currently use fast imports, so, this field is None for all production timelines.
+    /// See <https://github.com/neondatabase/neon/pull/9218> for more information.
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub import_pgdata: Option<import_pgdata::index_part_format::Root>,
+
    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -90,10 +98,11 @@ impl IndexPart {
    /// - 7: metadata_bytes is no longer written, but still read
    /// - 8: added `archived_at`
    /// - 9: +gc_blocking
-    const LATEST_VERSION: usize = 9;
+    /// - 10: +import_pgdata
+    const LATEST_VERSION: usize = 10;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -108,6 +117,7 @@ impl IndexPart {
            lineage: Default::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        }
    }

@@ -381,6 +391,7 @@ mod tests {
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -425,6 +436,7 @@ mod tests {
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -470,6 +482,7 @@ mod tests {
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -518,6 +531,7 @@ mod tests {
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        };

        let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -561,6 +575,7 @@ mod tests {
            lineage: Lineage::default(),
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -607,6 +622,7 @@ mod tests {
            },
            gc_blocking: None,
            last_aux_file_policy: None,
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -658,6 +674,7 @@ mod tests {
            },
            gc_blocking: None,
            last_aux_file_policy: Some(AuxFilePolicy::V2),
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -714,6 +731,7 @@ mod tests {
            lineage: Default::default(),
            gc_blocking: None,
            last_aux_file_policy: Default::default(),
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -771,6 +789,7 @@ mod tests {
            lineage: Default::default(),
            gc_blocking: None,
            last_aux_file_policy: Default::default(),
+            import_pgdata: None,
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
@@ -833,6 +852,83 @@ mod tests {
            }),
            last_aux_file_policy: Default::default(),
            archived_at: None,
+            import_pgdata: None,
+        };
+
+        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    #[test]
+    fn v10_importpgdata_is_parsed() {
+        let example = r#"{
+            "version": 10,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "gc_blocking": {
+                "started_at": "2024-07-19T09:00:00.123",
+                "reasons": ["DetachAncestor"]
+            },
+            "import_pgdata": {
+                "V1": {
+                    "Done": {
+                        "idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
+                        "started_at": "2024-11-13T09:23:42.123",
+                        "finished_at": "2024-11-13T09:42:23.123"
+                    }
+                }
+            }
+        }"#;
+
+        let expected = IndexPart {
+            version: 10,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: None,
+            lineage: Default::default(),
+            gc_blocking: Some(GcBlocking {
+                started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
+                reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
+            }),
+            last_aux_file_policy: Default::default(),
+            archived_at: None,
+            import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
+                started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
+                finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
+                idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
+            })))
        };

        let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4,6 +4,7 @@ pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
 pub(crate) mod handle;
+pub(crate) mod import_pgdata;
 mod init;
 pub mod layer_manager;
 pub(crate) mod logical_size;
@@ -49,6 +50,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
    fs_ext, pausable_failpoint,
+    postgres_client::PostgresClientProtocol,
    sync::gate::{Gate, GateGuard},
 };
 use wal_decoder::serialized_batch::SerializedValueBatch;
@@ -892,10 +894,11 @@ pub(crate) enum ShutdownMode {
    /// While we are flushing, we continue to accept read I/O for LSNs ingested before
    /// the call to [`Timeline::shutdown`].
    FreezeAndFlush,
-    /// Only flush the layers to the remote storage without freezing any open layers. This is the
-    /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
-    /// the generation number.
-    Flush,
+    /// Only flush the layers to the remote storage without freezing any open layers. Flush the deletion
+    /// queue. This is the mode used by ancestor detach and any other operation that reloads a tenant
+    /// without increasing the generation number. Note that this mode cannot be used at tenant shutdown,
+    /// as flushing the deletion queue at that time will cause shutdown-in-progress errors.
+    Reload,
    /// Shut down immediately, without waiting for any open layers to flush.
    Hard,
 }
@@ -1816,7 +1819,7 @@ impl Timeline {
            }
        }

-        if let ShutdownMode::Flush = mode {
+        if let ShutdownMode::Reload = mode {
            // drain the upload queue
            self.remote_client.shutdown().await;
            if !self.remote_client.no_pending_work() {
@@ -2085,6 +2088,11 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
    }

+    pub(crate) fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
+    }
+
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2172,6 +2180,21 @@ impl Timeline {
            )
    }

+    /// Resolve the effective WAL receiver protocol to use for this tenant.
+    ///
+    /// Priority order is:
+    /// 1. Tenant config override
+    /// 2. Default value for tenant config override
+    /// 3. Pageserver config override
+    /// 4. Pageserver config default
+    pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .wal_receiver_protocol_override
+            .or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
+            .unwrap_or(self.conf.wal_receiver_protocol)
+    }
+
    pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.
@@ -2464,6 +2487,7 @@ impl Timeline {
        *guard = Some(WalReceiver::start(
            Arc::clone(self),
            WalReceiverConf {
+                protocol: self.resolve_wal_receiver_protocol(),
                wal_connect_timeout,
                lagging_wal_timeout,
                max_lsn_wal_lag,
@@ -2647,6 +2671,7 @@ impl Timeline {
        //
        // NB: generation numbers naturally protect against this because they disambiguate
        //     (1) and (4)
+        // TODO: this is basically a no-op now, should we remove it?
        self.remote_client.schedule_barrier()?;
        // Tenant::create_timeline will wait for these uploads to happen before returning, or
        // on retry.
@@ -2702,20 +2727,23 @@ impl Timeline {
                {
                    Some(cancel) => cancel.cancel(),
                    None => {
-                        let state = self.current_state();
-                        if matches!(
-                            state,
-                            TimelineState::Broken { .. } | TimelineState::Stopping
-                        ) {
-
-                            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
-                            // Don't make noise.
-                        } else {
-                            warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
-                            debug_assert!(false);
+                        match self.current_state() {
+                            TimelineState::Broken { .. } | TimelineState::Stopping => {
+                                // Can happen when timeline detail endpoint is used when deletion is ongoing (or it's broken).
+                                // Don't make noise.
+                            }
+                            TimelineState::Loading => {
+                                // Import does not return an activated timeline.
+                                info!("discarding priority boost for logical size calculation because timeline is not yet active");
+                            }
+                            TimelineState::Active => {
+                                // activation should be setting the once cell
+                                warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
+                                debug_assert!(false);
+                            }
                        }
                    }
-                };
+                }
            }
        }

@@ -3819,7 +3847,8 @@ impl Timeline {
        };

        // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
-        // This makes us refuse ingest until the new layers have been persisted to the remote.
+        // This makes us refuse ingest until the new layers have been persisted to the remote
+        let start = Instant::now();
        self.remote_client
            .wait_completion()
            .await
@@ -3832,6 +3861,8 @@ impl Timeline {
                    FlushLayerError::Other(anyhow!(e).into())
                }
            })?;
+        let duration = start.elapsed().as_secs_f64();
+        self.metrics.flush_wait_upload_time_gauge_add(duration);

        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
        // a compaction can delete the file and then it won't be available for uploads any more.
@@ -5886,7 +5917,7 @@ impl<'a> TimelineWriter<'a> {
        batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        if batch.is_empty() {
+        if !batch.has_data() {
            return Ok(());
        }

--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -0,0 +1,218 @@
+use std::sync::Arc;
+
+use anyhow::{bail, Context};
+use remote_storage::RemotePath;
+use tokio_util::sync::CancellationToken;
+use tracing::{info, info_span, Instrument};
+use utils::lsn::Lsn;
+
+use crate::{context::RequestContext, tenant::metadata::TimelineMetadata};
+
+use super::Timeline;
+
+mod flow;
+mod importbucket_client;
+mod importbucket_format;
+pub(crate) mod index_part_format;
+pub(crate) mod upcall_api;
+
+pub async fn doit(
+    timeline: &Arc<Timeline>,
+    index_part: index_part_format::Root,
+    ctx: &RequestContext,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let index_part_format::Root::V1(v1) = index_part;
+    let index_part_format::InProgress {
+        location,
+        idempotency_key,
+        started_at,
+    } = match v1 {
+        index_part_format::V1::Done(_) => return Ok(()),
+        index_part_format::V1::InProgress(in_progress) => in_progress,
+    };
+
+    let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
+
+    info!("get spec early so we know we'll be able to upcall when done");
+    let Some(spec) = storage.get_spec().await? else {
+        bail!("spec not found")
+    };
+
+    let upcall_client =
+        upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?;
+
+    //
+    // send an early progress update to clean up k8s job early and generate potentially useful logs
+    //
+    info!("send early progress update");
+    upcall_client
+        .send_progress_until_success(&spec)
+        .instrument(info_span!("early_progress_update"))
+        .await?;
+
+    let status_prefix = RemotePath::from_string("status").unwrap();
+
+    //
+    // See if shard is done.
+    // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing.
+    //
+    let shard_status_key =
+        status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug()));
+    let shard_status: Option<importbucket_format::ShardStatus> =
+        storage.get_json(&shard_status_key).await?;
+    info!(?shard_status, "peeking shard status");
+    if shard_status.map(|st| st.done).unwrap_or(false) {
+        info!("shard status indicates that the shard is done, skipping import");
+    } else {
+        // TODO: checkpoint the progress into the IndexPart instead of restarting
+        // from the beginning.
+
+        //
+        // Wipe the slate clean - the flow does not allow resuming.
+        // We can implement resuming in the future by checkpointing the progress into the IndexPart.
+        //
+        info!("wipe the slate clean");
+        {
+            // TODO: do we need to hold GC lock for this?
+            let mut guard = timeline.layers.write().await;
+            assert!(
+                guard.layer_map()?.open_layer.is_none(),
+                "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
+            );
+            let all_layers_keys = guard.all_persistent_layers();
+            let all_layers: Vec<_> = all_layers_keys
+                .iter()
+                .map(|key| guard.get_from_key(key))
+                .collect();
+            let open = guard.open_mut().context("open_mut")?;
+
+            timeline.remote_client.schedule_gc_update(&all_layers)?;
+            open.finish_gc_timeline(&all_layers);
+        }
+
+        //
+        // Wait for pgdata to finish uploading
+        //
+        info!("wait for pgdata to reach status 'done'");
+        let pgdata_status_key = status_prefix.join("pgdata");
+        loop {
+            let res = async {
+                let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
+                    .get_json(&pgdata_status_key)
+                    .await
+                    .context("get pgdata status")?;
+                info!(?pgdata_status, "peeking pgdata status");
+                if pgdata_status.map(|st| st.done).unwrap_or(false) {
+                    Ok(())
+                } else {
+                    Err(anyhow::anyhow!("pgdata not done yet"))
+                }
+            }
+            .await;
+            match res {
+                Ok(_) => break,
+                Err(err) => {
+                    info!(?err, "indefintely waiting for pgdata to finish");
+                    if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
+                        .await
+                        .is_ok()
+                    {
+                        bail!("cancelled while waiting for pgdata");
+                    }
+                }
+            }
+        }
+
+        //
+        // Do the import
+        //
+        info!("do the import");
+        let control_file = storage.get_control_file().await?;
+        let base_lsn = control_file.base_lsn();
+
+        info!("update TimelineMetadata based on LSNs from control file");
+        {
+            let pg_version = control_file.pg_version();
+            let _ctx: &RequestContext = ctx;
+            async move {
+                // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
+                // checkpoint record, and prev_record_lsn should point to its beginning.
+                // We should read the real end of the record from the WAL, but here we
+                // just fake it.
+                let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
+                let prev_record_lsn = base_lsn;
+                let metadata = TimelineMetadata::new(
+                    disk_consistent_lsn,
+                    Some(prev_record_lsn),
+                    None,     // no ancestor
+                    Lsn(0),   // no ancestor lsn
+                    base_lsn, // latest_gc_cutoff_lsn
+                    base_lsn, // initdb_lsn
+                    pg_version,
+                );
+
+                let _start_lsn = disk_consistent_lsn + 1;
+
+                timeline
+                    .remote_client
+                    .schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+                timeline.remote_client.wait_completion().await?;
+
+                anyhow::Ok(())
+            }
+        }
+        .await?;
+
+        flow::run(
+            timeline.clone(),
+            base_lsn,
+            control_file,
+            storage.clone(),
+            ctx,
+        )
+        .await?;
+
+        //
+        // Communicate that shard is done.
+        //
+        storage
+            .put_json(
+                &shard_status_key,
+                &importbucket_format::ShardStatus { done: true },
+            )
+            .await
+            .context("put shard status")?;
+    }
+
+    //
+    // Ensure at-least-once delivery of the upcall to cplane
+    // before we mark the task as done and never come here again.
+    //
+    info!("send final progress update");
+    upcall_client
+        .send_progress_until_success(&spec)
+        .instrument(info_span!("final_progress_update"))
+        .await?;
+
+    //
+    // Mark as done in index_part.
+    // This makes subsequent timeline loads enter the normal load code path
+    // instead of spawning the import task and calling this here function.
+    //
+    info!("mark import as complete in index part");
+    timeline
+        .remote_client
+        .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1(
+            index_part_format::V1::Done(index_part_format::Done {
+                idempotency_key,
+                started_at,
+                finished_at: chrono::Utc::now().naive_utc(),
+            }),
+        )))?;
+
+    timeline.remote_client.wait_completion().await?;
+
+    Ok(())
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -0,0 +1,798 @@
+//! Import a PGDATA directory into an empty root timeline.
+//!
+//! This module is adapted hackathon code by Heikki and Stas.
+//! Other code in the parent module was written by Christian as part of a customer PoC.
+//!
+//! The hackathon code was producing image layer files as a free-standing program.
+//!
+//! It has been modified to
+//! - run inside a running Pageserver, within the proper lifecycles of Timeline -> Tenant(Shard)
+//! - => sharding-awareness: produce image layers with only the data relevant for this shard
+//! - => S3 as the source for the PGDATA instead of local filesystem
+//!
+//! TODOs before productionization:
+//! - ChunkProcessingJob size / ImportJob::total_size does not account for sharding.
+//!   => produced image layers likely too small.
+//! - ChunkProcessingJob should cut up an ImportJob to hit exactly target image layer size.
+//! - asserts / unwraps need to be replaced with errors
+//! - don't trust remote objects will be small (=prevent OOMs in those cases)
+//!     - limit all in-memory buffers in size, or download to disk and read from there
+//! - limit task concurrency
+//! - generally play nice with other tenants in the system
+//!   - importbucket is different bucket than main pageserver storage, so, should be fine wrt S3 rate limits
+//!   - but concerns like network bandwidth, local disk write bandwidth, local disk capacity, etc
+//! - integrate with layer eviction system
+//! - audit for Tenant::cancel and Timeline::cancel responsiveness
+//! - audit for Tenant/Timeline gate holding (we spawn tokio tasks during this flow!)
+//!
+//! An incomplete set of TODOs from the Hackathon:
+//! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)
+
+use std::sync::Arc;
+
+use anyhow::{bail, ensure};
+use bytes::Bytes;
+
+use itertools::Itertools;
+use pageserver_api::{
+    key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY},
+    reltag::RelTag,
+    shard::ShardIdentity,
+};
+use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, BLCKSZ};
+use tokio::task::JoinSet;
+use tracing::{debug, info_span, instrument, Instrument};
+
+use crate::{
+    assert_u64_eq_usize::UsizeIsU64,
+    pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory},
+};
+use crate::{
+    context::{DownloadBehavior, RequestContext},
+    pgdatadir_mapping::{DbDirectory, RelDirectory},
+    task_mgr::TaskKind,
+    tenant::storage_layer::{ImageLayerWriter, Layer},
+};
+
+use pageserver_api::key::Key;
+use pageserver_api::key::{
+    slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, CHECKPOINT_KEY, CONTROLFILE_KEY,
+    TWOPHASEDIR_KEY,
+};
+use pageserver_api::keyspace::singleton_range;
+use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range};
+use pageserver_api::reltag::SlruKind;
+use utils::bin_ser::BeSer;
+use utils::lsn::Lsn;
+
+use std::collections::HashSet;
+use std::ops::Range;
+
+use super::{
+    importbucket_client::{ControlFile, RemoteStorageWrapper},
+    Timeline,
+};
+
+use remote_storage::RemotePath;
+
+/// Entry point of the import flow: builds a [`Flow`] with an empty task list
+/// and drives it to completion.
+///
+/// Note that `Flow::run` replaces `pgdata_lsn` with the LSN it derives from
+/// `control_file` before any work is planned.
+pub async fn run(
+    timeline: Arc<Timeline>,
+    pgdata_lsn: Lsn,
+    control_file: ControlFile,
+    storage: RemoteStorageWrapper,
+    ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    // Nothing is planned yet; `Flow::run` fills `tasks` before executing them.
+    let flow = Flow {
+        timeline,
+        pgdata_lsn,
+        control_file,
+        tasks: Vec::new(),
+        storage,
+    };
+    flow.run(ctx).await
+}
+
+/// State of one PGDATA import run.
+struct Flow {
+    /// Timeline that receives the imported image layers.
+    timeline: Arc<Timeline>,
+    /// LSN at which the image layers are written.
+    /// `Flow::run` overwrites this with the checkpoint LSN from `control_file`.
+    pgdata_lsn: Lsn,
+    /// Control file of the PGDATA being imported.
+    control_file: ControlFile,
+    /// Import tasks planned so far, in the order they were planned.
+    /// `Flow::run` drains this list when it partitions the tasks into jobs.
+    tasks: Vec<AnyImportTask>,
+    /// Client for the storage location that holds the PGDATA.
+    storage: RemoteStorageWrapper,
+}
+
+impl Flow {
+    /// Perform the ingestion into [`Self::timeline`].
+    /// Assumes the timeline is empty (= no layers).
+    ///
+    /// The work happens in three phases:
+    /// 1. Plan: push one import task per key (range) onto `self.tasks`.
+    /// 2. Partition: cut the task list into chunks of roughly 1 GiB
+    ///    (as estimated by `ImportTask::total_size`), one `ChunkProcessingJob` each.
+    /// 3. Execute: spawn all jobs at once and wait for every one of them.
+    ///
+    /// Returns an error that concatenates the errors of all failed jobs
+    /// if at least one job failed, panicked, or was cancelled.
+    pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+        // The LSN passed to the constructor is replaced by the (aligned)
+        // checkpoint LSN recorded in the control file.
+        let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align();
+
+        self.pgdata_lsn = pgdata_lsn;
+
+        let datadir = PgDataDir::new(&self.storage).await?;
+
+        // Import dbdir (00:00:00 keyspace)
+        // This is just constructed here, but will be written to the image layer in the first call to import_db()
+        let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory {
+            dbdirs: datadir
+                .dbs
+                .iter()
+                .map(|db| ((db.spcnode, db.dboid), true))
+                .collect(),
+        })?);
+        self.tasks
+            .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());
+
+        // Import databases (00:spcnode:dbnode keyspace for each db)
+        for db in datadir.dbs {
+            self.import_db(&db).await?;
+        }
+
+        // Import SLRUs
+
+        // pg_xact (01:00 keyspace)
+        self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
+            .await?;
+        // pg_multixact/members (01:01 keyspace)
+        self.import_slru(
+            SlruKind::MultiXactMembers,
+            &self.storage.pgdata().join("pg_multixact/members"),
+        )
+        .await?;
+        // pg_multixact/offsets (01:02 keyspace)
+        self.import_slru(
+            SlruKind::MultiXactOffsets,
+            &self.storage.pgdata().join("pg_multixact/offsets"),
+        )
+        .await?;
+
+        // Import pg_twophase.
+        // TODO: as empty
+        let twophasedir_buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
+            xids: HashSet::new(),
+        })?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                TWOPHASEDIR_KEY,
+                Bytes::from(twophasedir_buf),
+            )));
+
+        // Controlfile, checkpoint
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                CONTROLFILE_KEY,
+                self.control_file.control_file_buf().clone(),
+            )));
+
+        let checkpoint_buf = self
+            .control_file
+            .control_file_data()
+            .checkPointCopy
+            .encode()?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                CHECKPOINT_KEY,
+                checkpoint_buf,
+            )));
+
+        // Assigns parts of key space to later parallel jobs
+        //
+        // Tasks are consumed in the order they were planned above. A chunk is
+        // closed as soon as adding the next task would push its estimated size
+        // over 1 GiB; the closed chunk's key range ends where that next task starts.
+        // NOTE(review): this presumably relies on the tasks having been planned
+        // in ascending key order - confirm.
+        let mut last_end_key = Key::MIN;
+        let mut current_chunk = Vec::new();
+        let mut current_chunk_size: usize = 0;
+        let mut parallel_jobs = Vec::new();
+        for task in std::mem::take(&mut self.tasks).into_iter() {
+            if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 {
+                let key_range = last_end_key..task.key_range().start;
+                parallel_jobs.push(ChunkProcessingJob::new(
+                    key_range.clone(),
+                    std::mem::take(&mut current_chunk),
+                    &self,
+                ));
+                last_end_key = key_range.end;
+                current_chunk_size = 0;
+            }
+            current_chunk_size += task.total_size();
+            current_chunk.push(task);
+        }
+        // Whatever is left forms the final job, which covers the rest of the key space.
+        parallel_jobs.push(ChunkProcessingJob::new(
+            last_end_key..Key::MAX,
+            current_chunk,
+            &self,
+        ));
+
+        // Start all jobs simultaneously
+        let mut work = JoinSet::new();
+        // TODO: semaphore?
+        for job in parallel_jobs {
+            let ctx: RequestContext =
+                ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);
+            work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job")));
+        }
+        // Wait for every job, even after a failure, so that all errors get reported.
+        let mut results = Vec::new();
+        while let Some(result) = work.join_next().await {
+            match result {
+                Ok(res) => {
+                    results.push(res);
+                }
+                // The join error itself is not propagated; only a generic message is recorded.
+                Err(_joinset_err) => {
+                    results.push(Err(anyhow::anyhow!(
+                        "parallel job panicked or cancelled, check pageserver logs"
+                    )));
+                }
+            }
+        }
+
+        if results.iter().all(|r| r.is_ok()) {
+            Ok(())
+        } else {
+            // Concatenate the debug representation of every job error into one message.
+            let mut msg = String::new();
+            for result in results {
+                if let Err(err) = result {
+                    msg.push_str(&format!("{err:?}\n\n"));
+                }
+            }
+            bail!("Some parallel jobs failed:\n\n{msg}");
+        }
+    }
+
+    /// Plan the import of one database directory.
+    ///
+    /// Downloads the database's `pg_filenode.map` and pushes, in this order:
+    /// a task for the relmap key, a task for the reldir key, and for every
+    /// relation segment file a block-range task, followed by a relsize task
+    /// if the file is the last segment of its fork.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the relmap cannot be downloaded, if the reldir cannot be
+    /// serialized, if a file size is not a multiple of 8192, or if a block
+    /// number does not fit into a `u32`.
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
+    async fn import_db(&mut self, db: &PgDataDirDb) -> anyhow::Result<()> {
+        // Number of 8 KiB blocks in one full 1 GiB relation segment file.
+        const BLOCKS_PER_SEGMENT: u32 = 1024 * 1024 * 1024 / 8192;
+
+        debug!("start");
+        scopeguard::defer! {
+            debug!("return");
+        }
+
+        // Import relmap (00:spcnode:dbnode:00:*:00)
+        let relmap_key = relmap_file_key(db.spcnode, db.dboid);
+        debug!("Constructing relmap entry, key {relmap_key}");
+        let relmap_path = db.path.join("pg_filenode.map");
+        let relmap_buf = self.storage.get(&relmap_path).await?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                relmap_key, relmap_buf,
+            )));
+
+        // Import reldir (00:spcnode:dbnode:00:*:01)
+        let reldir_key = rel_dir_to_key(db.spcnode, db.dboid);
+        debug!("Constructing reldirs entry, key {reldir_key}");
+        let reldir_buf = RelDirectory::ser(&RelDirectory {
+            rels: db
+                .files
+                .iter()
+                .map(|f| (f.rel_tag.relnode, f.rel_tag.forknum))
+                .collect(),
+        })?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                reldir_key,
+                Bytes::from(reldir_buf),
+            )));
+
+        // Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last
+        // segment in a given relation (00:spcnode:dbnode:reloid:fork:ff)
+        for file in &db.files {
+            debug!(%file.path, %file.filesize, "importing file");
+            let len = file.filesize;
+            ensure!(
+                len % 8192 == 0,
+                "size {} of {} is not a multiple of the block size",
+                len,
+                file.path
+            );
+
+            // Block numbers are u32: use checked conversions and arithmetic so that
+            // an unexpected segment number or file size yields an error instead of
+            // silently wrapping or truncating into a wrong key range.
+            let nblocks_in_file = u32::try_from(len / 8192)?;
+            let start_blk: u32 = file
+                .segno
+                .checked_mul(BLOCKS_PER_SEGMENT)
+                .ok_or_else(|| {
+                    anyhow::anyhow!(
+                        "start block number of segment {} of {} overflows u32",
+                        file.segno,
+                        file.path
+                    )
+                })?;
+            let end_blk = start_blk.checked_add(nblocks_in_file).ok_or_else(|| {
+                anyhow::anyhow!(
+                    "end block number of segment {} of {} overflows u32",
+                    file.segno,
+                    file.path
+                )
+            })?;
+
+            let start_key = rel_block_to_key(file.rel_tag, start_blk);
+            let end_key = rel_block_to_key(file.rel_tag, end_blk);
+            self.tasks
+                .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(
+                    *self.timeline.get_shard_identity(),
+                    start_key..end_key,
+                    &file.path,
+                    self.storage.clone(),
+                )));
+
+            // Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff)
+            if let Some(nblocks) = file.nblocks {
+                let size_key = rel_size_to_key(file.rel_tag);
+                //debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}");
+                // NOTE(review): `nblocks` is a `usize`, so the stored value is
+                // `size_of::<usize>()` bytes wide, whereas `import_slru` stores its
+                // segment size as a `u32` - confirm the width the relsize reader expects.
+                let buf = nblocks.to_le_bytes();
+                self.tasks
+                    .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                        size_key,
+                        Bytes::from(buf.to_vec()),
+                    )));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Plan the import of one SLRU directory.
+    ///
+    /// Lists the files in `path`, keeps those whose name parses as a hexadecimal
+    /// segment number, and pushes a task for the SLRU directory key followed by,
+    /// for each segment in ascending segment-number order, a block-range task
+    /// and a segment-size task.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the listing fails, if the directory cannot be serialized, or if a
+    /// segment's size is not a multiple of 8192 or its block count exceeds `u32`.
+    async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
+        let segments = self.storage.listfilesindir(path).await?;
+        let mut segments: Vec<(String, u32, usize)> = segments
+            .into_iter()
+            .filter_map(|(path, size)| {
+                let filename = path.object_name()?;
+                let segno = u32::from_str_radix(filename, 16).ok()?;
+                Some((filename.to_string(), segno, size))
+            })
+            .collect();
+
+        // `run` derives the key range of each job from the order of the planned
+        // tasks, so segments must be planned in ascending segment-number order.
+        // Nothing visible here guarantees that the listing is ordered that way,
+        // hence sort explicitly (like PgDataDirDb::new does for relation files).
+        segments.sort_by_key(|(_name, segno, _size)| *segno);
+
+        // Write SlruDir
+        let slrudir_key = slru_dir_to_key(kind);
+        let segnos: HashSet<u32> = segments
+            .iter()
+            .map(|(_path, segno, _size)| *segno)
+            .collect();
+        let slrudir = SlruSegmentDirectory { segments: segnos };
+        let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?;
+        self.tasks
+            .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                slrudir_key,
+                Bytes::from(slrudir_buf),
+            )));
+
+        for (segpath, segno, size) in segments {
+            // SlruSegBlocks for each segment
+            let p = path.join(&segpath);
+            let file_size = size;
+            ensure!(file_size % 8192 == 0);
+            let nblocks = u32::try_from(file_size / 8192)?;
+            let start_key = slru_block_to_key(kind, segno, 0);
+            let end_key = slru_block_to_key(kind, segno, nblocks);
+            debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment");
+            self.tasks
+                .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(
+                    *self.timeline.get_shard_identity(),
+                    start_key..end_key,
+                    &p,
+                    self.storage.clone(),
+                )));
+
+            // Followed by SlruSegSize
+            let segsize_key = slru_segment_size_to_key(kind, segno);
+            let segsize_buf = nblocks.to_le_bytes();
+            self.tasks
+                .push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(
+                    segsize_key,
+                    Bytes::copy_from_slice(&segsize_buf),
+                )));
+        }
+        Ok(())
+    }
+}
+
+//
+// dbdir iteration tools
+//
+
+/// The databases found in a PGDATA directory.
+struct PgDataDir {
+    /// One entry per database, each carrying its spcnode, dboid and path.
+    /// `PgDataDir::new` sorts them by `(spcnode, dboid)`.
+    pub dbs: Vec<PgDataDirDb>, // spcnode, dboid, path
+}
+
+/// One database directory inside the PGDATA, together with its relation files.
+struct PgDataDirDb {
+    /// Tablespace OID the database is imported under.
+    pub spcnode: u32,
+    /// Database OID (0 is used for the `global` directory).
+    pub dboid: u32,
+    /// Remote path of the database directory.
+    pub path: RemotePath,
+    /// Relation segment files of this database, sorted by (relnode, forknum, segno).
+    pub files: Vec<PgDataDirDbFile>,
+}
+
+/// One relation segment file of a database.
+struct PgDataDirDbFile {
+    /// Remote path the segment is downloaded from.
+    pub path: RemotePath,
+    /// Relation fork this segment belongs to.
+    pub rel_tag: RelTag,
+    /// Segment number within the fork.
+    pub segno: u32,
+    /// Size of the segment file in bytes.
+    pub filesize: usize,
+    // Cumulative size of the given fork, set only for the last segment of that fork
+    pub nblocks: Option<usize>,
+}
+
+impl PgDataDir {
+    /// Discover the databases of the PGDATA held in `storage`.
+    ///
+    /// Every entry under `base/` whose name parses as a `u32` is treated as a
+    /// database in the default tablespace; the `global` directory is added as
+    /// database 0 of the global tablespace. The result is sorted by
+    /// `(spcnode, dboid)`.
+    async fn new(storage: &RemoteStorageWrapper) -> anyhow::Result<Self> {
+        let datadir_path = storage.pgdata();
+        let basedir = datadir_path.join("base");
+
+        // Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first
+        // Traverse database in increasing oid order
+        let mut db_oids: Vec<u32> = Vec::new();
+        for entry in storage.listdir(&basedir).await? {
+            // Entries that are not named after a numeric OID are ignored.
+            if let Some(oid) = entry
+                .object_name()
+                .and_then(|name| name.parse::<u32>().ok())
+            {
+                db_oids.push(oid);
+            }
+        }
+        db_oids.sort();
+        debug!(?db_oids, "found databases");
+
+        let mut dbs = Vec::new();
+        for dboid in db_oids {
+            let db_path = basedir.join(dboid.to_string());
+            let db = PgDataDirDb::new(
+                storage,
+                &db_path,
+                pg_constants::DEFAULTTABLESPACE_OID,
+                dboid,
+                &datadir_path,
+            )
+            .await?;
+            dbs.push(db);
+        }
+
+        // special case for global catalogs
+        let global_path = datadir_path.join("global");
+        let global_db = PgDataDirDb::new(
+            storage,
+            &global_path,
+            pg_constants::GLOBALTABLESPACE_OID,
+            0,
+            &datadir_path,
+        )
+        .await?;
+        dbs.push(global_db);
+
+        dbs.sort_by_key(|db| (db.spcnode, db.dboid));
+
+        Ok(Self { dbs })
+    }
+}
+
+impl PgDataDirDb {
+    /// List the relation segment files of one database directory.
+    ///
+    /// Files in `db_path` whose name does not parse as a relation file name are
+    /// ignored. The remaining files are sorted by (relnode, forknum, segno), and
+    /// the last segment of every fork gets the fork's total size in blocks
+    /// recorded in `nblocks`; all other segments get `None`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the listing fails or if the size of a relation segment file is
+    /// not a multiple of `BLCKSZ`.
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%dboid, %db_path))]
+    async fn new(
+        storage: &RemoteStorageWrapper,
+        db_path: &RemotePath,
+        spcnode: u32,
+        dboid: u32,
+        datadir_path: &RemotePath,
+    ) -> anyhow::Result<Self> {
+        let mut files: Vec<PgDataDirDbFile> = storage
+            .listfilesindir(db_path)
+            .await?
+            .into_iter()
+            .filter_map(|(path, size)| {
+                debug!(%path, %size, "found file in dbdir");
+                path.object_name().and_then(|name| {
+                    // returns (relnode, forknum, segno)
+                    parse_relfilename(name).ok().map(|x| (size, x))
+                })
+            })
+            .sorted_by_key(|(_, relfilename)| *relfilename)
+            .map(
+                |(filesize, (relnode, forknum, segno))| -> anyhow::Result<PgDataDirDbFile> {
+                    let rel_tag = RelTag {
+                        spcnode,
+                        dbnode: dboid,
+                        relnode,
+                        forknum,
+                    };
+
+                    let path = datadir_path.join(rel_tag.to_segfile_name(segno));
+                    // A partial block is reported as an error rather than a panic.
+                    ensure!(
+                        filesize % BLCKSZ as usize == 0,
+                        "size {} of relation segment file {} is not a multiple of the block size {}",
+                        filesize,
+                        path,
+                        BLCKSZ
+                    );
+
+                    Ok(PgDataDirDbFile {
+                        path,
+                        filesize,
+                        rel_tag,
+                        segno,
+                        nblocks: None, // filled in below for the last segment of each fork
+                    })
+                },
+            )
+            .collect::<anyhow::Result<_>>()?;
+
+        // Set cumulative sizes. Do all of that math here, so that later we could easier
+        // parallelize over segments and know with which segments we need to write relsize
+        // entry.
+        let mut cumulative_nblocks: usize = 0;
+        let mut prev_rel_tag: Option<RelTag> = None;
+        for i in 0..files.len() {
+            // Size of this segment alone, in blocks.
+            let segment_nblocks = files[i].filesize / BLCKSZ as usize;
+            if prev_rel_tag == Some(files[i].rel_tag) {
+                cumulative_nblocks += segment_nblocks;
+            } else {
+                // First segment of a new fork: restart the running total.
+                cumulative_nblocks = segment_nblocks;
+            }
+
+            files[i].nblocks = if i == files.len() - 1 || files[i + 1].rel_tag != files[i].rel_tag {
+                Some(cumulative_nblocks)
+            } else {
+                None
+            };
+
+            prev_rel_tag = Some(files[i].rel_tag);
+        }
+
+        Ok(PgDataDirDb {
+            files,
+            path: db_path.clone(),
+            spcnode,
+            dboid,
+        })
+    }
+}
+
+/// A unit of import work that writes images for some key range into an image layer.
+trait ImportTask {
+    /// The key range this task writes into.
+    fn key_range(&self) -> Range<Key>;
+
+    /// Size estimate used to partition tasks into jobs: 8192 bytes per key for
+    /// a contiguous range, `u32::MAX` otherwise.
+    fn total_size(&self) -> usize {
+        // TODO: revisit this
+        let range = self.key_range();
+        if !is_contiguous_range(&range) {
+            return u32::MAX as usize;
+        }
+        contiguous_range_len(&range) as usize * 8192
+    }
+
+    /// Execute the task, writing its images through `layer_writer`.
+    /// Returns the number of images written.
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize>;
+}
+
+/// Import task that writes one already-materialized value at one key.
+struct ImportSingleKeyTask {
+    /// Key to write.
+    key: Key,
+    /// Image to store at `key`.
+    buf: Bytes,
+}
+
+impl ImportSingleKeyTask {
+    /// Create a task that stores `buf` at `key`.
+    fn new(key: Key, buf: Bytes) -> Self {
+        Self { key, buf }
+    }
+}
+
+impl ImportTask for ImportSingleKeyTask {
+    /// The range that contains exactly this task's key.
+    fn key_range(&self) -> Range<Key> {
+        singleton_range(self.key)
+    }
+
+    /// Write the single image; always reports one image written.
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        let Self { key, buf } = self;
+        layer_writer.put_image(key, buf, ctx).await?;
+        Ok(1)
+    }
+}
+
+/// Import task that copies the blocks of one relation segment file into an image layer.
+struct ImportRelBlocksTask {
+    /// Used to skip blocks whose keys are disposable for this shard.
+    shard_identity: ShardIdentity,
+    /// Block keys covered by the segment file.
+    key_range: Range<Key>,
+    /// Remote path of the segment file.
+    path: RemotePath,
+    /// Client used to download ranges of the file.
+    storage: RemoteStorageWrapper,
+}
+
+impl ImportRelBlocksTask {
+    /// Create a task for the segment file at `path` covering `key_range`.
+    fn new(
+        shard_identity: ShardIdentity,
+        key_range: Range<Key>,
+        path: &RemotePath,
+        storage: RemoteStorageWrapper,
+    ) -> Self {
+        let path = path.clone();
+        Self {
+            shard_identity,
+            key_range,
+            path,
+            storage,
+        }
+    }
+}
+
+impl ImportTask for ImportRelBlocksTask {
+    fn key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    /// Download the blocks of the segment file whose keys are not disposable
+    /// for this shard, and write each one as an image.
+    ///
+    /// Runs of adjacent wanted blocks are fetched with a single ranged read.
+    /// Returns the number of images written.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the key range is not a block range within one relation fork,
+    /// if a download fails or returns fewer bytes than requested, or if writing
+    /// an image fails.
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%self.path))]
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        debug!("Importing relation file");
+
+        let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
+        let (rel_tag_end, end_blk) = self.key_range.end.to_rel_block()?;
+        ensure!(
+            rel_tag == rel_tag_end,
+            "key range {}..{} spans more than one relation fork",
+            self.key_range.start,
+            self.key_range.end
+        );
+
+        // One (keys, start offset, end offset) entry per wanted block, with
+        // entries that are adjacent in the file merged into a single range.
+        let ranges = (start_blk..end_blk)
+            .enumerate()
+            .filter_map(|(i, blknum)| {
+                let key = rel_block_to_key(rel_tag, blknum);
+                if self.shard_identity.is_key_disposable(&key) {
+                    return None;
+                }
+                let file_offset = i.checked_mul(8192).unwrap();
+                Some((
+                    vec![key],
+                    file_offset,
+                    file_offset.checked_add(8192).unwrap(),
+                ))
+            })
+            .coalesce(|(mut acc, acc_start, acc_end), (mut key, start, end)| {
+                assert_eq!(key.len(), 1);
+                assert!(!acc.is_empty());
+                assert!(acc_end > acc_start);
+                if acc_end == start /* TODO additional max range check here, to limit memory consumption per task to X */ {
+                    acc.push(key.pop().unwrap());
+                    Ok((acc, acc_start, end))
+                } else {
+                    Err(((acc, acc_start, acc_end), (key, start, end)))
+                }
+            });
+
+        let mut nimages = 0;
+        for (keys, range_start, range_end) in ranges {
+            let range_buf = self
+                .storage
+                .get_range(&self.path, range_start.into_u64(), range_end.into_u64())
+                .await?;
+            let mut buf = Bytes::from(range_buf);
+            // `split_to` panics when asked for more bytes than are left, so turn a
+            // short read into an error before carving the buffer into pages.
+            let needed = keys.len() * 8192;
+            ensure!(
+                buf.len() >= needed,
+                "short read of {}: requested bytes {}..{}, got {} bytes",
+                self.path,
+                range_start,
+                range_end,
+                buf.len()
+            );
+            // TODO: batched writes
+            for key in keys {
+                let image = buf.split_to(8192);
+                layer_writer.put_image(key, image, ctx).await?;
+                nimages += 1;
+            }
+        }
+
+        Ok(nimages)
+    }
+}
+
+/// Import task that copies the blocks of one SLRU segment file into an image layer.
+struct ImportSlruBlocksTask {
+    /// Used to check that no SLRU key is disposable for this shard.
+    shard_identity: ShardIdentity,
+    /// Block keys covered by the segment file.
+    key_range: Range<Key>,
+    /// Remote path of the segment file.
+    path: RemotePath,
+    /// Client used to download the file.
+    storage: RemoteStorageWrapper,
+}
+
+impl ImportSlruBlocksTask {
+    /// Create a task for the segment file at `path` covering `key_range`.
+    fn new(
+        shard_identity: ShardIdentity,
+        key_range: Range<Key>,
+        path: &RemotePath,
+        storage: RemoteStorageWrapper,
+    ) -> Self {
+        let path = path.clone();
+        Self {
+            shard_identity,
+            key_range,
+            path,
+            storage,
+        }
+    }
+}
+
+impl ImportTask for ImportSlruBlocksTask {
+    fn key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    /// Download the whole SLRU segment file and write one image per block in
+    /// the key range. Returns the number of images written.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the download fails, if the keys are not SLRU block keys, if the
+    /// downloaded file is too short for the key range, or if writing an image fails.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the shard identity considers one of the SLRU keys disposable.
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        debug!("Importing SLRU segment file {}", self.path);
+        let buf = self.storage.get(&self.path).await?;
+
+        let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
+        let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
+        let mut nimages = 0;
+        let mut file_offset = 0;
+        for blknum in start_blk..end_blk {
+            let key = slru_block_to_key(kind, segno, blknum);
+            assert!(
+                !self.shard_identity.is_key_disposable(&key),
+                "SLRU keys need to go into every shard"
+            );
+            // The key range was planned from a listing, so the downloaded file may
+            // be shorter than expected; report that instead of panicking on the slice.
+            let page_end = file_offset + 8192;
+            ensure!(
+                page_end <= buf.len(),
+                "SLRU segment file {} is too short: need {} bytes for block {}, got {}",
+                self.path,
+                page_end,
+                blknum,
+                buf.len()
+            );
+            // `Bytes::slice` shares the downloaded buffer instead of copying the page.
+            let image = buf.slice(file_offset..page_end);
+            file_offset = page_end;
+            layer_writer.put_image(key, image, ctx).await?;
+            nimages += 1;
+        }
+        Ok(nimages)
+    }
+}
+
+/// Any of the concrete import task kinds, so that tasks of different kinds
+/// can be kept in one list.
+enum AnyImportTask {
+    /// Write a single key with an already-materialized value.
+    SingleKey(ImportSingleKeyTask),
+    /// Copy the blocks of a relation segment file.
+    RelBlocks(ImportRelBlocksTask),
+    /// Copy the blocks of an SLRU segment file.
+    SlruBlocks(ImportSlruBlocksTask),
+}
+
+impl ImportTask for AnyImportTask {
+    /// Key range of the wrapped task.
+    fn key_range(&self) -> Range<Key> {
+        match self {
+            AnyImportTask::SingleKey(task) => task.key_range(),
+            AnyImportTask::RelBlocks(task) => task.key_range(),
+            AnyImportTask::SlruBlocks(task) => task.key_range(),
+        }
+    }
+
+    /// Runs the wrapped task.
+    /// Returns the number of images put into the `layer_writer`.
+    async fn doit(
+        self,
+        layer_writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        let nimages = match self {
+            AnyImportTask::SingleKey(task) => task.doit(layer_writer, ctx).await,
+            AnyImportTask::RelBlocks(task) => task.doit(layer_writer, ctx).await,
+            AnyImportTask::SlruBlocks(task) => task.doit(layer_writer, ctx).await,
+        };
+        nimages
+    }
+}
+
+/// Wraps a single-key task in the [`AnyImportTask::SingleKey`] variant.
+impl From<ImportSingleKeyTask> for AnyImportTask {
+    fn from(task: ImportSingleKeyTask) -> Self {
+        AnyImportTask::SingleKey(task)
+    }
+}
+
+/// Wraps a relation-blocks task in the [`AnyImportTask::RelBlocks`] variant.
+impl From<ImportRelBlocksTask> for AnyImportTask {
+    fn from(task: ImportRelBlocksTask) -> Self {
+        AnyImportTask::RelBlocks(task)
+    }
+}
+
+/// Wraps an SLRU-blocks task in the [`AnyImportTask::SlruBlocks`] variant.
+impl From<ImportSlruBlocksTask> for AnyImportTask {
+    fn from(task: ImportSlruBlocksTask) -> Self {
+        AnyImportTask::SlruBlocks(task)
+    }
+}
+
+/// A batch of import tasks whose images all go into one image layer.
+struct ChunkProcessingJob {
+    /// Timeline the resulting layer is added to.
+    timeline: Arc<Timeline>,
+    /// Key range the image layer writer is created for.
+    range: Range<Key>,
+    /// Tasks to execute, in order, against the image layer writer.
+    tasks: Vec<AnyImportTask>,
+
+    /// LSN the image layer is written at.
+    pgdata_lsn: Lsn,
+}
+
+impl ChunkProcessingJob {
+    /// Create a job for `tasks` covering `range`, taking the timeline and LSN from `env`.
+    ///
+    /// Panics if `env.pgdata_lsn` is not valid.
+    fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &Flow) -> Self {
+        assert!(env.pgdata_lsn.is_valid());
+        let timeline = env.timeline.clone();
+        let pgdata_lsn = env.pgdata_lsn;
+        Self {
+            timeline,
+            range,
+            tasks,
+            pgdata_lsn,
+        }
+    }
+
+    /// Execute all tasks against one image layer writer. If any image was
+    /// written, finish the layer, register it with the timeline's layer map
+    /// and schedule its upload.
+    async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> {
+        let mut writer = ImageLayerWriter::new(
+            self.timeline.conf,
+            self.timeline.timeline_id,
+            self.timeline.tenant_shard_id,
+            &self.range,
+            self.pgdata_lsn,
+            ctx,
+        )
+        .await?;
+
+        let mut image_count = 0;
+        for task in self.tasks {
+            let written = task.doit(&mut writer, ctx).await?;
+            image_count += written;
+        }
+
+        if image_count == 0 {
+            // dropping the writer cleans up
+            return Ok(());
+        }
+
+        let (desc, path) = writer.finish(ctx).await?;
+        let resident_layer =
+            Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
+
+        // this is sharing the same code as create_image_layers
+        let mut layers_guard = self.timeline.layers.write().await;
+        layers_guard
+            .open_mut()?
+            .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics);
+        crate::tenant::timeline::drop_wlock(layers_guard);
+
+        // Schedule the layer for upload but don't add barriers such as
+        // wait for completion or index upload, so we don't inhibit upload parallelism.
+        // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?)
+        // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level.
+        self.timeline
+            .remote_client
+            .schedule_layer_file_upload(resident_layer)?;
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -0,0 +1,315 @@
+use std::{ops::Bound, sync::Arc};
+
+use anyhow::Context;
+use bytes::Bytes;
+use postgres_ffi::ControlFileData;
+use remote_storage::{
+    Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath,
+};
+use serde::de::DeserializeOwned;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, info, instrument};
+use utils::lsn::Lsn;
+
+use crate::{assert_u64_eq_usize::U64IsUsize, config::PageServerConf};
+
+use super::{importbucket_format, index_part_format};
+
+pub async fn new(
+    conf: &'static PageServerConf,
+    location: &index_part_format::Location,
+    cancel: CancellationToken,
+) -> Result<RemoteStorageWrapper, anyhow::Error> {
+    // FIXME: we probably want some timeout, and we might be able to assume the max file
+    // size on S3 is 1GiB (postgres segment size). But the problem is that the individual
+    // downloaders don't know enough about concurrent downloads to make a guess on the
+    // expected bandwidth and resulting best timeout.
+    let timeout = std::time::Duration::from_secs(24 * 60 * 60);
+    let location_storage = match location {
+        #[cfg(feature = "testing")]
+        index_part_format::Location::LocalFs { path } => {
+            GenericRemoteStorage::LocalFs(remote_storage::LocalFs::new(path.clone(), timeout)?)
+        }
+        index_part_format::Location::AwsS3 {
+            region,
+            bucket,
+            key,
+        } => {
+            // TODO: think about security implications of letting the client specify the bucket & prefix.
+            // It's the most flexible right now, but, possibly we want to move bucket name into PS conf
+            // and force the timeline_id into the prefix?
+            GenericRemoteStorage::AwsS3(Arc::new(
+                remote_storage::S3Bucket::new(
+                    &remote_storage::S3Config {
+                        bucket_name: bucket.clone(),
+                        prefix_in_bucket: Some(key.clone()),
+                        bucket_region: region.clone(),
+                        endpoint: conf
+                            .import_pgdata_aws_endpoint_url
+                            .clone()
+                            .map(|url| url.to_string()), //  by specifying None here, remote_storage/aws-sdk-rust will infer from env
+                        concurrency_limit: 100.try_into().unwrap(), // TODO: think about this
+                        max_keys_per_list_response: Some(1000),     // TODO: think about this
+                        upload_storage_class: None,                 // irrelevant
+                    },
+                    timeout,
+                )
+                .await
+                .context("setup s3 bucket")?,
+            ))
+        }
+    };
+    let storage_wrapper = RemoteStorageWrapper::new(location_storage, cancel);
+    Ok(storage_wrapper)
+}
+
+/// Wrap [`remote_storage`] APIs to make it look a bit more like a filesystem API
+/// such as [`tokio::fs`], which was used in the original implementation of the import code.
+#[derive(Clone)]
+pub struct RemoteStorageWrapper {
+    storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+}
+
+impl RemoteStorageWrapper {
+    pub fn new(storage: GenericRemoteStorage, cancel: CancellationToken) -> Self {
+        Self { storage, cancel }
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn listfilesindir(
+        &self,
+        path: &RemotePath,
+    ) -> Result<Vec<(RemotePath, usize)>, DownloadError> {
+        assert!(
+            path.object_name().is_some(),
+            "must specify dirname, without trailing slash"
+        );
+        let path = path.add_trailing_slash();
+
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Listing { keys, prefixes: _ } = self
+                    .storage
+                    .list(
+                        Some(&path),
+                        remote_storage::ListingMode::WithDelimiter,
+                        None,
+                        &self.cancel,
+                    )
+                    .await?;
+                let res = keys
+                    .into_iter()
+                    .map(|ListingObject { key, size, .. }| (key, size.into_usize()))
+                    .collect();
+                Ok(res)
+            },
+            &format!("listfilesindir {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(?res, "returning");
+        res
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn listdir(&self, path: &RemotePath) -> Result<Vec<RemotePath>, DownloadError> {
+        assert!(
+            path.object_name().is_some(),
+            "must specify dirname, without trailing slash"
+        );
+        let path = path.add_trailing_slash(); // list the children of `path`, not `path` itself
+
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Listing { keys, prefixes } = self
+                    .storage
+                    .list(
+                        Some(&path),
+                        remote_storage::ListingMode::WithDelimiter,
+                        None,
+                        &self.cancel,
+                    )
+                    .await?;
+                let res = keys
+                    .into_iter()
+                    .map(|ListingObject { key, .. }| key)
+                    .chain(prefixes) // object keys first, then the common prefixes
+                    .collect();
+                Ok(res)
+            },
+            &format!("listdir {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(?res, "returning");
+        res
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn get(&self, path: &RemotePath) -> Result<Bytes, DownloadError> {
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Download {
+                    download_stream, ..
+                } = self
+                    .storage
+                    .download(path, &DownloadOpts::default(), &self.cancel)
+                    .await?;
+                let mut reader = tokio_util::io::StreamReader::new(download_stream);
+
+                // XXX optimize this, can we get the capacity hint from somewhere?
+                let mut buf = Vec::new();
+                tokio::io::copy_buf(&mut reader, &mut buf).await?;
+                Ok(Bytes::from(buf))
+            },
+            &format!("download {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
+        res
+    }
+
+    pub async fn get_spec(&self) -> Result<Option<importbucket_format::Spec>, anyhow::Error> {
+        self.get_json(&RemotePath::from_string("spec.json").unwrap())
+            .await
+            .context("get spec")
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn get_json<T: DeserializeOwned>(
+        &self,
+        path: &RemotePath,
+    ) -> Result<Option<T>, DownloadError> {
+        let buf = match self.get(path).await {
+            Ok(buf) => buf,
+            Err(DownloadError::NotFound) => return Ok(None), // a missing object is not an error
+            Err(err) => return Err(err),
+        };
+        let res = serde_json::from_slice(&buf)
+            .context("deserialize")
+            // TODO: own error type
+            .map_err(DownloadError::Other)?;
+        Ok(Some(res))
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn put_json<T>(&self, path: &RemotePath, value: &T) -> anyhow::Result<()>
+    where
+        T: serde::Serialize,
+    {
+        let buf = serde_json::to_vec(value)?;
+        let bytes = Bytes::from(buf); // serialized once, re-sent on every attempt
+        utils::backoff::retry(
+            || async {
+                let size = bytes.len();
+                let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
+                self.storage
+                    .upload_storage_object(bytes, size, path, &self.cancel)
+                    .await
+            },
+            remote_storage::TimeoutOrCancel::caused_by_cancel,
+            1,
+            u32::MAX,
+            &format!("put json {path}"),
+            &self.cancel,
+        )
+        .await // NOTE(review): `None` presumably means cancelled during backoff; confirm in `retry`
+        .ok_or_else(|| anyhow::anyhow!("cancelled during put json {path}"))?
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
+    pub async fn get_range(
+        &self,
+        path: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: u64,
+    ) -> Result<Vec<u8>, DownloadError> {
+        let len = end_exclusive
+            .checked_sub(start_inclusive)
+            .unwrap()
+            .into_usize();
+        let res = crate::tenant::remote_timeline_client::download::download_retry_forever(
+            || async {
+                let Download {
+                    download_stream, ..
+                } = self
+                    .storage
+                    .download(
+                        path,
+                        &DownloadOpts {
+                            etag: None,
+                            byte_start: Bound::Included(start_inclusive),
+                            byte_end: Bound::Excluded(end_exclusive)
+                        },
+                        &self.cancel)
+                    .await?;
+                let mut reader = tokio_util::io::StreamReader::new(download_stream);
+
+                let mut buf = Vec::with_capacity(len);
+                tokio::io::copy_buf(&mut reader, &mut buf).await?;
+                Ok(buf)
+            },
+            &format!("download range len=0x{len:x} [0x{start_inclusive:x},0x{end_exclusive:x}) from {path:?}"),
+            &self.cancel,
+        )
+        .await;
+        debug!(len = res.as_ref().ok().map(|buf| buf.len()), "done");
+        res
+    }
+
+    pub fn pgdata(&self) -> RemotePath {
+        RemotePath::from_string("pgdata").unwrap()
+    }
+
+    pub async fn get_control_file(&self) -> Result<ControlFile, anyhow::Error> {
+        let control_file_path = self.pgdata().join("global/pg_control");
+        info!("get control file from {control_file_path}");
+        let control_file_buf = self.get(&control_file_path).await?;
+        ControlFile::new(control_file_buf)
+    }
+}
+
+pub struct ControlFile {
+    control_file_data: ControlFileData,
+    control_file_buf: Bytes,
+}
+
+impl ControlFile {
+    pub(crate) fn new(control_file_buf: Bytes) -> Result<Self, anyhow::Error> {
+        // XXX ControlFileData is version-specific, we're always using v14 here. v17 had changes.
+        let control_file_data = ControlFileData::decode(&control_file_buf)?;
+        let control_file = ControlFile {
+            control_file_data,
+            control_file_buf,
+        };
+        control_file.try_pg_version()?; // so that we can offer infallible pg_version()
+        Ok(control_file)
+    }
+    pub(crate) fn base_lsn(&self) -> Lsn {
+        Lsn(self.control_file_data.checkPoint).align() // aligned checkpoint LSN from pg_control
+    }
+    pub(crate) fn pg_version(&self) -> u32 {
+        self.try_pg_version()
+            .expect("new() checks that try_pg_version doesn't error")
+    }
+    pub(crate) fn control_file_data(&self) -> &ControlFileData {
+        &self.control_file_data
+    }
+    pub(crate) fn control_file_buf(&self) -> &Bytes {
+        &self.control_file_buf
+    }
+    fn try_pg_version(&self) -> anyhow::Result<u32> {
+        Ok(match self.control_file_data.catalog_version_no {
+            // these are from catversion.h
+            202107181 => 14,
+            202209061 => 15,
+            202307071 => 16,
+            /* XXX pg17 */
+            catversion => {
+                anyhow::bail!("unrecognized catalog version {catversion}")
+            }
+        })
+    }
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
@@ -0,0 +1,20 @@
+use serde::{Deserialize, Serialize};
+
+#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct PgdataStatus {
+    pub done: bool,
+    // TODO: remaining fields
+}
+
+#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct ShardStatus {
+    pub done: bool,
+    // TODO: remaining fields
+}
+
+// TODO: dedupe with fast_import code
+#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
+pub struct Spec {
+    pub project_id: String,
+    pub branch_id: String,
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
@@ -0,0 +1,68 @@
+use serde::{Deserialize, Serialize};
+
+#[cfg(feature = "testing")]
+use camino::Utf8PathBuf;
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum Root {
+    V1(V1),
+}
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum V1 {
+    InProgress(InProgress),
+    Done(Done),
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[serde(transparent)]
+pub struct IdempotencyKey(String);
+
+impl IdempotencyKey {
+    pub fn new(s: String) -> Self {
+        Self(s)
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct InProgress {
+    pub idempotency_key: IdempotencyKey,
+    pub location: Location,
+    pub started_at: chrono::NaiveDateTime,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct Done {
+    pub idempotency_key: IdempotencyKey,
+    pub started_at: chrono::NaiveDateTime,
+    pub finished_at: chrono::NaiveDateTime,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum Location {
+    #[cfg(feature = "testing")]
+    LocalFs { path: Utf8PathBuf },
+    AwsS3 {
+        region: String,
+        bucket: String,
+        key: String,
+    },
+}
+
+impl Root {
+    pub fn is_done(&self) -> bool {
+        match self {
+            Root::V1(v1) => match v1 {
+                V1::Done(_) => true,
+                V1::InProgress(_) => false,
+            },
+        }
+    }
+    pub fn idempotency_key(&self) -> &IdempotencyKey {
+        match self {
+            Root::V1(v1) => match v1 {
+                V1::InProgress(in_progress) => &in_progress.idempotency_key,
+                V1::Done(done) => &done.idempotency_key,
+            },
+        }
+    }
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
@@ -0,0 +1,119 @@
+//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate.
+use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt};
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use tracing::error;
+
+use crate::config::PageServerConf;
+use reqwest::Method;
+
+use super::importbucket_format::Spec;
+
+pub struct Client {
+    base_url: String,
+    authorization_header: Option<String>,
+    client: reqwest::Client,
+    cancel: CancellationToken,
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[derive(Serialize, Deserialize, Debug)]
+struct ImportProgressRequest {
+    // no fields yet, not sure if there ever will be any
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct ImportProgressResponse {
+    // we don't care
+}
+
+impl Client {
+    pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result<Self> {
+        let Some(ref base_url) = conf.import_pgdata_upcall_api else {
+            anyhow::bail!("import_pgdata_upcall_api is not configured")
+        };
+        Ok(Self {
+            base_url: base_url.to_string(),
+            client: reqwest::Client::new(),
+            cancel,
+            authorization_header: conf
+                .import_pgdata_upcall_api_token
+                .as_ref()
+                .map(|secret_string| secret_string.get_contents())
+                .map(|jwt| format!("Bearer {jwt}")),
+        })
+    }
+
+    fn start_request<U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+    ) -> reqwest::RequestBuilder {
+        let req = self.client.request(method, uri);
+        if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        }
+    }
+
+    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        self.start_request(method, uri)
+            .json(&body)
+            .send()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
+        &self,
+        method: Method,
+        uri: U,
+        body: B,
+    ) -> Result<reqwest::Response> {
+        let res = self.request_noerror(method, uri, body).await?;
+        let response = res.error_from_body().await?;
+        Ok(response)
+    }
+
+    pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> {
+        let url = format!(
+            "{}/projects/{}/branches/{}/import_progress", // base_url may already end in '/'
+            self.base_url.trim_end_matches('/'), spec.project_id, spec.branch_id
+        );
+        let ImportProgressResponse {} = self
+            .request(Method::POST, url, &ImportProgressRequest {})
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)?;
+        Ok(())
+    }
+
+    pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> {
+        loop {
+            match self.send_progress_once(spec).await {
+                Ok(()) => return Ok(()),
+                Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")),
+                Err(err) => {
+                    error!(?err, "error sending progress, retrying");
+                    if tokio::time::timeout(
+                        std::time::Duration::from_secs(10),
+                        self.cancel.cancelled(),
+                    )
+                    .await
+                    .is_ok()
+                    {
+                        anyhow::bail!("cancelled while sending early progress update");
+                    }
+                }
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline(
    }

    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-    timeline.shutdown(super::ShutdownMode::Flush).await;
+    timeline.shutdown(super::ShutdownMode::Reload).await;

    // TODO extend guard mechanism above with method
    // to make deletions possible while offloading is in progress
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -3,7 +3,7 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use tracing::{error, info, info_span};
-use utils::{fs_ext, id::TimelineId, lsn::Lsn};
+use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};

 use crate::{
    context::RequestContext,
@@ -23,14 +23,14 @@ use super::Timeline;
 pub struct UninitializedTimeline<'t> {
    pub(crate) owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
+    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
 }

 impl<'t> UninitializedTimeline<'t> {
    pub(crate) fn new(
        owning_tenant: &'t Tenant,
        timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
+        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
    ) -> Self {
        Self {
            owning_tenant,
@@ -87,6 +87,10 @@ impl<'t> UninitializedTimeline<'t> {
        }
    }

+    pub(crate) fn finish_creation_myself(&mut self) -> (Arc<Timeline>, TimelineCreateGuard) {
+        self.raw_timeline.take().expect("already checked")
+    }
+
    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
        self,
@@ -167,9 +171,10 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
 /// A guard for timeline creations in process: as long as this object exists, the timeline ID
 /// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
 #[must_use]
-pub(crate) struct TimelineCreateGuard<'t> {
-    owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
+pub(crate) struct TimelineCreateGuard {
+    pub(crate) _tenant_gate_guard: GateGuard,
+    pub(crate) owning_tenant: Arc<Tenant>,
+    pub(crate) timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
    pub(crate) idempotency: CreateTimelineIdempotency,
 }
@@ -184,20 +189,27 @@ pub(crate) enum TimelineExclusionError {
    },
    #[error("Already creating")]
    AlreadyCreating,
+    #[error("Shutting down")]
+    ShuttingDown,

    // e.g. I/O errors, or some failure deep in postgres initdb
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

-impl<'t> TimelineCreateGuard<'t> {
+impl TimelineCreateGuard {
    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
+        owning_tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
        idempotency: CreateTimelineIdempotency,
        allow_offloaded: bool,
    ) -> Result<Self, TimelineExclusionError> {
+        let _tenant_gate_guard = owning_tenant
+            .gate
+            .enter()
+            .map_err(|_| TimelineExclusionError::ShuttingDown)?;
+
        // Lock order: this is the only place we take both locks.  During drop() we only
        // lock creating_timelines
        let timelines = owning_tenant.timelines.lock().unwrap();
@@ -225,8 +237,12 @@ impl<'t> TimelineCreateGuard<'t> {
            return Err(TimelineExclusionError::AlreadyCreating);
        }
        creating_timelines.insert(timeline_id);
+        drop(creating_timelines);
+        drop(timelines_offloaded);
+        drop(timelines);
        Ok(Self {
-            owning_tenant,
+            _tenant_gate_guard,
+            owning_tenant: Arc::clone(owning_tenant),
            timeline_id,
            timeline_path,
            idempotency,
@@ -234,7 +250,7 @@ impl<'t> TimelineCreateGuard<'t> {
    }
 }

-impl Drop for TimelineCreateGuard<'_> {
+impl Drop for TimelineCreateGuard {
    fn drop(&mut self) {
        self.owning_tenant
            .timelines_creating
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::postgres_client::PostgresClientProtocol;

 use self::connection_manager::ConnectionManagerStatus;

@@ -45,6 +46,7 @@ use super::Timeline;

 #[derive(Clone)]
 pub struct WalReceiverConf {
+    pub protocol: PostgresClientProtocol,
    /// The timeout on the connection to safekeeper for WAL streaming.
    pub wal_connect_timeout: Duration,
    /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig;
 use utils::backoff::{
    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::postgres_client::wal_stream_connection_config;
+use utils::postgres_client::{
+    wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol,
+};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
@@ -533,6 +535,7 @@ impl ConnectionManagerState {
        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
        let ingest_batch_size = self.conf.ingest_batch_size;
+        let protocol = self.conf.protocol;
        let timeline = Arc::clone(&self.timeline);
        let ctx = ctx.detached_child(
            TaskKind::WalReceiverConnectionHandler,
@@ -546,6 +549,7 @@ impl ConnectionManagerState {

                let res = super::walreceiver_connection::handle_walreceiver_connection(
                    timeline,
+                    protocol,
                    new_sk.wal_source_connconf,
                    events_sender,
                    cancellation.clone(),
@@ -984,15 +988,33 @@ impl ConnectionManagerState {
                if info.safekeeper_connstr.is_empty() {
                    return None; // no connection string, ignore sk
                }
-                match wal_stream_connection_config(
-                    self.id,
-                    info.safekeeper_connstr.as_ref(),
-                    match &self.conf.auth_token {
-                        None => None,
-                        Some(x) => Some(x),
+
+                let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
+                    PostgresClientProtocol::Vanilla => {
+                        (None, None, None)
                    },
-                    self.conf.availability_zone.as_deref(),
-                ) {
+                    PostgresClientProtocol::Interpreted { .. } => {
+                        let shard_identity = self.timeline.get_shard_identity();
+                        (
+                            Some(shard_identity.number.0),
+                            Some(shard_identity.count.0),
+                            Some(shard_identity.stripe_size.0),
+                        )
+                    }
+                };
+
+                let connection_conf_args = ConnectionConfigArgs {
+                    protocol: self.conf.protocol,
+                    ttid: self.id,
+                    shard_number,
+                    shard_count,
+                    shard_stripe_size,
+                    listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
+                    auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
+                    availability_zone: self.conf.availability_zone.as_deref()
+                };
+
+                match wal_stream_connection_config(connection_conf_args) {
                    Ok(connstr) => Some((*sk_id, info, connstr)),
                    Err(e) => {
                        error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
@@ -1096,6 +1118,7 @@ impl ReconnectReason {
 mod tests {
    use super::*;
    use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
+    use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
    use url::Host;

    fn dummy_broker_sk_timeline(
@@ -1532,6 +1555,7 @@ mod tests {
            timeline,
            cancel: CancellationToken::new(),
            conf: WalReceiverConf {
+                protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
                wal_connect_timeout: Duration::from_secs(1),
                lagging_wal_timeout: Duration::from_secs(1),
                max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -22,7 +22,10 @@ use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn, Instrument};
-use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord};
+use wal_decoder::{
+    models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords},
+    wire_format::FromWireFormat,
+};

 use super::TaskStateUpdate;
 use crate::{
@@ -36,7 +39,7 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use utils::{id::NodeId, lsn::Lsn};
+use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
 use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};

 /// Status of the connection.
@@ -109,6 +112,7 @@ impl From<WalDecodeError> for WalReceiverError {
 #[allow(clippy::too_many_arguments)]
 pub(super) async fn handle_walreceiver_connection(
    timeline: Arc<Timeline>,
+    protocol: PostgresClientProtocol,
    wal_source_connconf: PgConnectionConfig,
    events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
    cancellation: CancellationToken,
@@ -260,6 +264,14 @@ pub(super) async fn handle_walreceiver_connection(

    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;

+    let interpreted_proto_config = match protocol {
+        PostgresClientProtocol::Vanilla => None,
+        PostgresClientProtocol::Interpreted {
+            format,
+            compression,
+        } => Some((format, compression)),
+    };
+
    while let Some(replication_message) = {
        select! {
            _ = cancellation.cancelled() => {
@@ -291,6 +303,15 @@ pub(super) async fn handle_walreceiver_connection(
                connection_status.latest_connection_update = now;
                connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
            }
+            ReplicationMessage::RawInterpretedWalRecords(raw) => {
+                connection_status.latest_connection_update = now;
+                if !raw.data().is_empty() {
+                    connection_status.latest_wal_update = now;
+                }
+
+                connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn()));
+                connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn()));
+            }
            &_ => {}
        };
        if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
@@ -298,7 +319,148 @@ pub(super) async fn handle_walreceiver_connection(
            return Ok(());
        }

+        async fn commit(
+            modification: &mut DatadirModification<'_>,
+            uncommitted: &mut u64,
+            filtered: &mut u64,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<()> {
+            WAL_INGEST
+                .records_committed
+                .inc_by(*uncommitted - *filtered);
+            modification.commit(ctx).await?;
+            *uncommitted = 0;
+            *filtered = 0;
+            Ok(())
+        }
+
        let status_update = match replication_message {
+            ReplicationMessage::RawInterpretedWalRecords(raw) => {
+                WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
+
+                let mut uncommitted_records = 0;
+                let mut filtered_records = 0;
+
+                // This is the end LSN of the raw WAL from which the records
+                // were interpreted.
+                let streaming_lsn = Lsn::from(raw.streaming_lsn());
+
+                let (format, compression) = interpreted_proto_config.unwrap();
+                let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
+                    .await
+                    .with_context(|| {
+                        anyhow::anyhow!(
+                        "Failed to deserialize interpreted records ending at LSN {streaming_lsn}"
+                    )
+                    })?;
+
+                let InterpretedWalRecords {
+                    records,
+                    next_record_lsn,
+                } = batch;
+
+                tracing::debug!(
+                    "Received WAL up to {} with next_record_lsn={:?}",
+                    streaming_lsn,
+                    next_record_lsn
+                );
+
+                // We start the modification at 0 because each interpreted record
+                // advances it to its end LSN. 0 is just an initialization placeholder.
+                let mut modification = timeline.begin_modification(Lsn(0));
+
+                for interpreted in records {
+                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
+                        && uncommitted_records > 0
+                    {
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
+                    }
+
+                    let local_next_record_lsn = interpreted.next_record_lsn;
+                    let ingested = walingest
+                        .ingest_record(interpreted, &mut modification, &ctx)
+                        .await
+                        .with_context(|| {
+                            format!("could not ingest record at {local_next_record_lsn}")
+                        })?;
+
+                    if !ingested {
+                        tracing::debug!(
+                            "ingest: filtered out record @ LSN {local_next_record_lsn}"
+                        );
+                        WAL_INGEST.records_filtered.inc();
+                        filtered_records += 1;
+                    }
+
+                    uncommitted_records += 1;
+
+                    // FIXME: this cannot be made pausable_failpoint without fixing the
+                    // failpoint library; in tests, the added amount of debugging will cause us
+                    // to timeout the tests.
+                    fail_point!("walreceiver-after-ingest");
+
+                    // Commit every ingest_batch_size records. Even if we filtered out
+                    // all records, we still need to call commit to advance the LSN.
+                    if uncommitted_records >= ingest_batch_size
+                        || modification.approx_pending_bytes()
+                            > DatadirModification::MAX_PENDING_BYTES
+                    {
+                        commit(
+                            &mut modification,
+                            &mut uncommitted_records,
+                            &mut filtered_records,
+                            &ctx,
+                        )
+                        .await?;
+                    }
+                }
+
+                // Records might have been filtered out on the safekeeper side, but we still
+                // need to advance last record LSN on all shards. If we've not ingested the latest
+                // record, then set the LSN of the modification past it. This way all shards
+                // advance their last record LSN at the same time.
+                let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
+                    Some(lsn) if lsn > modification.get_lsn() => {
+                        modification.set_lsn(lsn).unwrap();
+                        true
+                    }
+                    _ => false,
+                };
+
+                if uncommitted_records > 0 || needs_last_record_lsn_advance {
+                    // Commit any uncommitted records
+                    commit(
+                        &mut modification,
+                        &mut uncommitted_records,
+                        &mut filtered_records,
+                        &ctx,
+                    )
+                    .await?;
+                }
+
+                if !caught_up && streaming_lsn >= end_of_wal {
+                    info!("caught up at LSN {streaming_lsn}");
+                    caught_up = true;
+                }
+
+                tracing::debug!(
+                    "Ingested WAL up to {streaming_lsn}. Last record LSN is {}",
+                    timeline.get_last_record_lsn()
+                );
+
+                if let Some(lsn) = next_record_lsn {
+                    last_rec_lsn = lsn;
+                }
+
+                Some(streaming_lsn)
+            }
+
            ReplicationMessage::XLogData(xlog_data) => {
                // Pass the WAL data to the decoder, and see if we can decode
                // more records as a result.
@@ -316,21 +478,6 @@ pub(super) async fn handle_walreceiver_connection(
                    let mut uncommitted_records = 0;
                    let mut filtered_records = 0;

-                    async fn commit(
-                        modification: &mut DatadirModification<'_>,
-                        uncommitted: &mut u64,
-                        filtered: &mut u64,
-                        ctx: &RequestContext,
-                    ) -> anyhow::Result<()> {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(*uncommitted - *filtered);
-                        modification.commit(ctx).await?;
-                        *uncommitted = 0;
-                        *filtered = 0;
-                        Ok(())
-                    }
-
                    while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -3,6 +3,7 @@ use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use std::collections::HashSet;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;

@@ -14,7 +15,6 @@ use utils::lsn::AtomicLsn;
 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;

-#[cfg(feature = "testing")]
 use utils::generation::Generation;

 // clippy warns that Uninitialized is much smaller than Initialized, which wastes
@@ -38,6 +38,12 @@ impl UploadQueue {
    }
 }

+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub(crate) enum OpType {
+    MayReorder,
+    FlushDeletion,
+}
+
 /// This keeps track of queued and in-progress tasks.
 pub(crate) struct UploadQueueInitialized {
    /// Counter to assign task IDs
@@ -88,6 +94,9 @@ pub(crate) struct UploadQueueInitialized {
    #[cfg(feature = "testing")]
    pub(crate) dangling_files: HashMap<LayerName, Generation>,

+    /// Ensure we order file operations correctly.
+    pub(crate) recently_deleted: HashSet<(LayerName, Generation)>,
+
    /// Deletions that are blocked by the tenant configuration
    pub(crate) blocked_deletions: Vec<Delete>,

@@ -183,6 +192,7 @@ impl UploadQueue {
            queued_operations: VecDeque::new(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::new(),
+            recently_deleted: HashSet::new(),
            blocked_deletions: Vec::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
@@ -224,6 +234,7 @@ impl UploadQueue {
            queued_operations: VecDeque::new(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::new(),
+            recently_deleted: HashSet::new(),
            blocked_deletions: Vec::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
@@ -282,8 +293,8 @@ pub(crate) struct Delete {

 #[derive(Debug)]
 pub(crate) enum UploadOp {
-    /// Upload a layer file
-    UploadLayer(ResidentLayer, LayerFileMetadata),
+    /// Upload a layer file. The last field indicates the last operation for this file.
+    UploadLayer(ResidentLayer, LayerFileMetadata, Option<OpType>),

    /// Upload a index_part.json file
    UploadMetadata {
@@ -305,11 +316,11 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
-            UploadOp::UploadLayer(layer, metadata) => {
+            UploadOp::UploadLayer(layer, metadata, mode) => {
                write!(
                    f,
-                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer, metadata.file_size, metadata.generation
+                    "UploadLayer({}, size={:?}, gen={:?}, mode={:?})",
+                    layer, metadata.file_size, metadata.generation, mode
                )
            }
            UploadOp::UploadMetadata { uploaded, .. } => {
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
@@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
    }
 }

-impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
+impl<const N: usize, A: Alignment> Deref for AlignedSlice<'_, N, A> {
    type Target = [u8; N];

    fn deref(&self) -> &Self::Target {
@@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
    }
 }

-impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
+impl<const N: usize, A: Alignment> DerefMut for AlignedSlice<'_, N, A> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.buf
    }
 }

-impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
+impl<const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'_, N, A> {
    fn as_ref(&self) -> &[u8; N] {
        self.buf
    }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -334,14 +334,32 @@ impl WalIngest {
        // replaying it would fail to find the previous image of the page, because
        // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
        // record if it doesn't.
-        let vm_size = get_relsize(modification, vm_rel, ctx).await?;
+        //
+        // TODO: analyze the metrics and tighten this up accordingly. This logic
+        // implicitly assumes that VM pages see explicit WAL writes before
+        // implicit ClearVmBits, and will otherwise silently drop updates.
+        let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
+            WAL_INGEST
+                .clear_vm_bits_unknown
+                .with_label_values(&["relation"])
+                .inc();
+            return Ok(());
+        };
        if let Some(blknum) = new_vm_blk {
            if blknum >= vm_size {
+                WAL_INGEST
+                    .clear_vm_bits_unknown
+                    .with_label_values(&["new_page"])
+                    .inc();
                new_vm_blk = None;
            }
        }
        if let Some(blknum) = old_vm_blk {
            if blknum >= vm_size {
+                WAL_INGEST
+                    .clear_vm_bits_unknown
+                    .with_label_values(&["old_page"])
+                    .inc();
                old_vm_blk = None;
            }
        }
@@ -572,7 +590,8 @@ impl WalIngest {
                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                fsm_physical_page_no += 1;
            }
-            let nblocks = get_relsize(modification, rel, ctx).await?;
+            // TODO: re-examine the None case here wrt. sharding; should we error?
+            let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
            if nblocks > fsm_physical_page_no {
                // check if something to do: FSM is larger than truncate position
                self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -612,7 +631,8 @@ impl WalIngest {
                )?;
                vm_page_no += 1;
            }
-            let nblocks = get_relsize(modification, rel, ctx).await?;
+            // TODO: re-examine the None case here wrt. sharding; should we error?
+            let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
            if nblocks > vm_page_no {
                // check if something to do: VM is larger than truncate position
                self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1430,24 +1450,27 @@ impl WalIngest {
    }
 }

+/// Returns the size of the relation as of this modification, or None if the relation doesn't exist.
+///
+/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
+/// page number stored in the shard, or None if the shard does not have any pages for it.
 async fn get_relsize(
    modification: &DatadirModification<'_>,
    rel: RelTag,
    ctx: &RequestContext,
-) -> Result<BlockNumber, PageReconstructError> {
-    let nblocks = if !modification
+) -> Result<Option<BlockNumber>, PageReconstructError> {
+    if !modification
        .tline
        .get_rel_exists(rel, Version::Modified(modification), ctx)
        .await?
    {
-        0
-    } else {
-        modification
-            .tline
-            .get_rel_size(rel, Version::Modified(modification), ctx)
-            .await?
-    };
-    Ok(nblocks)
+        return Ok(None);
+    }
+    modification
+        .tline
+        .get_rel_size(rel, Version::Modified(modification), ctx)
+        .await
+        .map(Some)
 }

 #[allow(clippy::bool_assert_comparison)]