Update pageserver/src/tenant/secondary/heatmap_uploader.rs

Co-authored-by: Christian Schwarz <christian@neon.tech>
Update pageserver/src/tenant/secondary/downloader.rs
2026-05-17 05:00:38 +00:00 · 2023-12-20 16:10:59 +00:00 · 2023-12-20 16:09:05 +00:00 · 2023-12-20 11:02:05 +00:00 · 2023-12-20 10:39:44 +00:00 · 2023-12-20 10:37:03 +00:00
22 changed files with 2337 additions and 504 deletions
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -485,6 +485,13 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_id).await?)
    }

+    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
+        Ok(self
+            .http_client
+            .tenant_secondary_download(*tenant_id)
+            .await?)
+    }
+
    pub async fn timeline_create(
        &self,
        tenant_id: TenantId,
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -11,6 +11,7 @@ use crate::{
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -40,9 +41,9 @@ async fn await_lsn(
    loop {
        let latest = match get_lsns(tenant_id, pageserver).await {
            Ok(l) => l,
-            Err(e) => {
+            Err(_e) => {
                println!(
-                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    "🕑 Waiting for pageserver {} to activate...",
                    pageserver.conf.id
                );
                std::thread::sleep(Duration::from_millis(500));
@@ -89,7 +90,7 @@ pub async fn migrate_tenant(
    tenant_id: TenantId,
    dest_ps: PageServerNode,
 ) -> anyhow::Result<()> {
-    // Get a new generation
+    println!("🤔 Checking existing status...");
    let attachment_service = AttachmentService::from_env(env);

    fn build_location_config(
@@ -135,6 +136,20 @@ pub async fn migrate_tenant(
        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
    }

+    println!(
+        "🔁 Downloading latest layers to destination pageserver {}",
+        dest_ps.conf.id
+    );
+    match dest_ps
+        .tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
+        .await
+    {
+        Ok(()) => {}
+        Err(_) => {
+            println!("  (skipping, destination wasn't in secondary mode)")
+        }
+    }
+
    let gen = attachment_service
        .attach_hook(tenant_id, dest_ps.conf.id)
        .await?;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -15,6 +15,12 @@ pub struct Gate {
    name: String,
 }

+impl std::fmt::Debug for Gate {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Gate<{}>", self.name)
+    }
+}
+
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use pageserver_api::models::*;
+use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method};
 use utils::{
    http::error::HttpErrorBody,
@@ -162,6 +162,18 @@ impl Client {
        Ok(())
    }

+    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/secondary/download",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, ())
+            .await?
+            .error_for_status()
+            .map(|_| ())
+            .map_err(|e| Error::ApiError(format!("{}", e)))
+    }
+
    pub async fn location_config(
        &self,
        tenant_id: TenantId,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -526,6 +526,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
+            tenant_manager.clone(),
            background_jobs_barrier.clone(),
        )?;
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -42,25 +42,27 @@
 //   reading these fields. We use the Debug impl for semi-structured logging, though.

 use std::{
+    collections::HashMap,
    sync::Arc,
    time::{Duration, SystemTime},
 };

 use anyhow::Context;
-use camino::Utf8Path;
+use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
 use utils::completion;
-use utils::serde_percent::Percent;
+use utils::{id::TimelineId, serde_percent::Percent};

 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
-        self,
+        mgr::TenantManager,
+        secondary::SecondaryTenant,
        storage_layer::{AsLayerDesc, EvictionError, Layer},
        Timeline,
    },
@@ -86,6 +88,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
+    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -111,8 +114,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
-                .await;
+            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
            Ok(())
        },
    );
@@ -125,7 +127,7 @@ async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenants_dir: &Utf8Path,
+    tenant_manager: Arc<TenantManager>,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -152,7 +154,7 @@ async fn disk_usage_eviction_task(
                state,
                task_config,
                storage,
-                tenants_dir,
+                &tenant_manager,
                &cancel,
            )
            .await;
@@ -187,12 +189,15 @@ async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenants_dir: &Utf8Path,
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
+    let tenants_dir = tenant_manager.get_conf().tenants_path();
+    let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res =
+        disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, tenant_manager, cancel)
+            .await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -202,7 +207,7 @@ async fn disk_usage_eviction_task_iteration(
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
-                    let after = filesystem_level_usage::get(tenants_dir, task_config)
+                    let after = filesystem_level_usage::get(&tenants_dir, task_config)
                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                        .context("get filesystem-level disk usage after evictions")?;

@@ -278,6 +283,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
    _storage: &GenericRemoteStorage,
    usage_pre: U,
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -297,7 +303,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates = match collect_eviction_candidates(cancel).await? {
+    let candidates = match collect_eviction_candidates(tenant_manager, cancel).await? {
        EvictionCandidates::Cancelled => {
            return Ok(IterationOutcome::Cancelled);
        }
@@ -333,11 +339,14 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
+    let mut secondary_by_tenant: HashMap<TenantShardId, Vec<(TimelineId, Layer)>> = HashMap::new();
+
    let mut warned = None;
    let mut usage_planned = usage_pre;
    let mut evicted_amount = 0;

-    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+    let mut attached_candidates = Vec::new();
+    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
                no_candidates_evicted = i,
@@ -346,14 +355,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            break;
        }

-        if partition == &MinResidentSizePartition::Below && warned.is_none() {
+        if partition == MinResidentSizePartition::Below && warned.is_none() {
            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
            warned = Some(usage_planned);
        }

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
        evicted_amount += 1;
+
+        // Split off attached vs. secondary tenants' layers: these are handled differently later
+        if let EvictionCandidateSource::Secondary(ttid) = candidate.source {
+            let batch = secondary_by_tenant.entry(ttid.0).or_default();
+            batch.push((ttid.1, candidate.layer));
+        } else {
+            attached_candidates.push((partition, candidate));
+        }
    }
+    let candidates = attached_candidates;

    let usage_planned = match warned {
        Some(respecting_tenant_min_resident_size) => PlannedUsage {
@@ -367,7 +385,20 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    };
    debug!(?usage_planned, "usage planned");

-    // phase2: evict layers
+    // phase2 (secondary tenants): evict victims batched by tenant
+    for (tenant_shard_id, timeline_layers) in secondary_by_tenant {
+        // Q: Why do we go via TenantManager again rather than just deleting files, or keeping
+        // an Arc ref to the secondary state?
+        // A: It's because a given tenant's local storage **belongs** to whoever is currently
+        // live in the TenantManager.  We must avoid a race where we might plan an eviction
+        // for secondary, and then execute it when the tenant is actually in an attached state.
+        tenant_manager
+            .evict_tenant_layers(&tenant_shard_id, timeline_layers)
+            .instrument(tracing::info_span!("evict_tenant_layers", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+            .await;
+    }
+
+    // phase2 (attached tenants): evict layers

    let mut js = tokio::task::JoinSet::new();
    let limit = 1000;
@@ -417,13 +448,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            };

            js.spawn(async move {
-                let rtc = candidate.timeline.remote_client.as_ref().expect(
-                    "holding the witness, all timelines must have a remote timeline client",
-                );
                let file_size = candidate.layer.layer_desc().file_size;
                candidate
                    .layer
-                    .evict_and_wait(rtc)
+                    .evict_and_wait()
                    .await
                    .map(|()| file_size)
                    .map_err(|e| (file_size, e))
@@ -454,9 +482,18 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    }))
 }

+// An eviction candidate might originate from either an attached tenant
+// with a [`Tenant`] and [`Timeline`] object, or from a secondary tenant
+// location.  These differ in how we will execute the eviction.
+#[derive(Clone)]
+enum EvictionCandidateSource {
+    Attached(Arc<Timeline>),
+    Secondary((TenantShardId, TimelineId)),
+}
+
 #[derive(Clone)]
 struct EvictionCandidate {
-    timeline: Arc<Timeline>,
+    source: EvictionCandidateSource,
    layer: Layer,
    last_activity_ts: SystemTime,
 }
@@ -506,30 +543,25 @@ enum EvictionCandidates {
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
 async fn collect_eviction_candidates(
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
    // get a snapshot of the list of tenants
-    let tenants = tenant::mgr::list_tenants()
-        .await
-        .context("get list of tenants")?;
-
    let mut candidates = Vec::new();

-    for (tenant_id, _state) in &tenants {
+    let tenants = tenant_manager.get_attached_active_tenant_shards();
+
+    for tenant in tenants {
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
-            Ok(tenant) => tenant,
-            Err(e) => {
-                // this can happen if tenant has lifecycle transition after we fetched it
-                debug!("failed to get tenant: {e:#}");
-                continue;
-            }
-        };

        if tenant.cancel.is_cancelled() {
-            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
+            info!(tenant_shard_id=%tenant.get_tenant_shard_id(), "Skipping tenant for eviction, it is shutting down");
+        }
+
+        if !tenant.is_active() {
+            debug!(tenant_shard_id=%tenant.get_tenant_shard_id(), "Ignoring non-active tenant for eviction");
            continue;
        }

@@ -594,7 +626,7 @@ async fn collect_eviction_candidates(
        for (timeline, layer_info) in tenant_candidates.into_iter() {
            let file_size = layer_info.file_size();
            let candidate = EvictionCandidate {
-                timeline,
+                source: EvictionCandidateSource::Attached(timeline),
                last_activity_ts: layer_info.last_activity_ts,
                layer: layer_info.layer,
            };
@@ -608,6 +640,60 @@ async fn collect_eviction_candidates(
        }
    }

+    // FIXME: this is a long loop over all secondary locations.  At the least, respect
+    // cancellation here, but really we need to break up the loop.  We could extract the
+    // Arc<SecondaryTenant>s and iterate over them with some tokio yields in there.  Ideally
+    // though we should just reduce the total amount of work: our eviction goals do not require
+    // listing absolutely every layer in every tenant: we could sample this.
+    let mut secondary_tenants = Vec::new();
+    tenant_manager.foreach_secondary_tenants(
+        |tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
+            secondary_tenants.push((*tenant_shard_id, state.clone()));
+        },
+    );
+
+    for (tenant_shard_id, secondary_tenant) in secondary_tenants {
+        for (timeline_id, layer_info) in secondary_tenant
+            .get_layers_for_eviction(tenant_manager.get_conf(), tenant_shard_id)
+            .instrument(tracing::info_span!("get_layers_for_eviction", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+            .await
+        {
+            let mut tenant_candidates = Vec::new();
+            debug!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, "timeline resident layers (secondary) count: {}", layer_info.resident_layers.len());
+            tenant_candidates.extend(
+                layer_info
+                    .resident_layers
+                    .into_iter()
+                    .map(|layer_infos| (timeline_id, layer_infos)),
+            );
+
+            tenant_candidates.sort_unstable_by_key(|(_, layer_info)| {
+                std::cmp::Reverse(layer_info.last_activity_ts)
+            });
+
+            candidates.extend(
+                tenant_candidates
+                    .into_iter()
+                    .map(|(timeline_id, candidate)| {
+                        (
+                            // Secondary locations' layers are always considered above the min resident size,
+                            // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                            // the layers have sufficiently old access times.
+                            MinResidentSizePartition::Above,
+                            EvictionCandidate {
+                                source: EvictionCandidateSource::Secondary((
+                                    tenant_shard_id,
+                                    timeline_id,
+                                )),
+                                last_activity_ts: candidate.last_activity_ts,
+                                layer: candidate.layer,
+                            },
+                        )
+                    }),
+            );
+        }
+    }
+
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
    candidates
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1272,6 +1272,23 @@ async fn put_tenant_location_config_handler(
        // which is not a 400 but a 409.
        .map_err(ApiError::BadRequest)?;

+    if let Some(_flush_ms) = flush {
+        match state
+            .secondary_controller
+            .upload_tenant(tenant_shard_id)
+            .await
+        {
+            Ok(()) => {
+                tracing::info!("Uploaded heatmap during flush");
+            }
+            Err(e) => {
+                tracing::warn!("Failed to flush heatmap: {e}");
+            }
+        }
+    } else {
+        tracing::info!("No flush requested when configuring");
+    }
+
    json_response(StatusCode::OK, ())
 }

@@ -1601,10 +1618,14 @@ async fn disk_usage_eviction_run(
        )));
    };

-    let state = state.disk_usage_eviction_state.clone();
+    let eviction_state = state.disk_usage_eviction_state.clone();

    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state, storage, usage, &cancel,
+        &eviction_state,
+        storage,
+        usage,
+        &state.tenant_manager,
+        &cancel,
    )
    .await;

@@ -1630,6 +1651,21 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

+async fn secondary_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    state
+        .secondary_controller
+        .download_tenant(tenant_shard_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1898,6 +1934,9 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
+        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
+            api_handler(r, secondary_download_handler)
+        })
        .put("/v1/tenant/:tenant_shard_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1369,6 +1369,8 @@ pub(crate) struct SecondaryModeMetrics {
    pub(crate) upload_heatmap: IntCounter,
    pub(crate) upload_heatmap_errors: IntCounter,
    pub(crate) upload_heatmap_duration: Histogram,
+    pub(crate) download_heatmap: IntCounter,
+    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
    upload_heatmap: register_int_counter!(
@@ -1386,6 +1388,16 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
+    download_heatmap: register_int_counter!(
+        "pageserver_secondary_download_heatmap",
+        "Number of downloads of heatmaps by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
+    download_layer: register_int_counter!(
+        "pageserver_secondary_download_layer",
+        "Number of downloads of layers by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -258,6 +258,9 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

+    /// See [`crate::tenant::secondary`].
+    SecondaryDownloads,
+
    /// See [`crate::tenant::secondary`].
    SecondaryUploads,

--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -585,7 +585,7 @@ impl DeleteTenantFlow {
                            }
                            break;
                        }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
                            // This is unexpected: this secondary tenants should not have been created, and we
                            // are not in a position to shut it down from here.
                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,6 +44,8 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
+use super::secondary::SecondaryTenant;
+use super::storage_layer::Layer;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -57,7 +59,7 @@ use super::TenantSharedResources;
 /// having a properly acquired generation (Secondary doesn't need a generation)
 pub(crate) enum TenantSlot {
    Attached(Arc<Tenant>),
-    Secondary,
+    Secondary(Arc<SecondaryTenant>),
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
    InProgress(utils::completion::Barrier),
@@ -67,7 +69,7 @@ impl std::fmt::Debug for TenantSlot {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
-            Self::Secondary => write!(f, "Secondary"),
+            Self::Secondary(_) => write!(f, "Secondary"),
            Self::InProgress(_) => write!(f, "InProgress"),
        }
    }
@@ -78,7 +80,7 @@ impl TenantSlot {
    fn get_attached(&self) -> Option<&Arc<Tenant>> {
        match self {
            Self::Attached(t) => Some(t),
-            Self::Secondary => None,
+            Self::Secondary(_) => None,
            Self::InProgress(_) => None,
        }
    }
@@ -464,12 +466,18 @@ pub async fn init_tenant_mgr(
                *gen
            } else {
                match &location_conf.mode {
-                    LocationMode::Secondary(_) => {
+                    LocationMode::Secondary(secondary_config) => {
                        // We do not require the control plane's permission for secondary mode
                        // tenants, because they do no remote writes and hence require no
                        // generation number
                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(tenant_shard_id, TenantSlot::Secondary);
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                secondary_config,
+                            )),
+                        );
                    }
                    LocationMode::Attached(_) => {
                        // TODO: augment re-attach API to enable the control plane to
@@ -664,8 +672,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {

                            total_attached += 1;
                        }
-                        TenantSlot::Secondary => {
-                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
+                        TenantSlot::Secondary(state) => {
+                            // We don't need to wait for this individually per-tenant: the
+                            // downloader task will be waited on eventually, this cancel
+                            // is just to encourage it to drop out if it is doing work
+                            // for this tenant right now.
+                            state.cancel.cancel();
+
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -848,12 +862,28 @@ impl TenantManager {
            Some(TenantSlot::InProgress(_)) => {
                Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
            }
-            None | Some(TenantSlot::Secondary) => {
+            None | Some(TenantSlot::Secondary(_)) => {
                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
            }
        }
    }

+    pub(crate) fn get_secondary_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Option<Arc<SecondaryTenant>> {
+        let locked = self.tenants.read().unwrap();
+
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+            .ok()
+            .flatten();
+
+        match peek_slot {
+            Some(TenantSlot::Secondary(s)) => Some(s.clone()),
+            _ => None,
+        }
+    }
+
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
@@ -865,10 +895,15 @@ impl TenantManager {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

+        enum FastPathModified {
+            Attached(Arc<Tenant>),
+            Secondary(Arc<SecondaryTenant>),
+        }
+
        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
        // then we do not need to set the slot to InProgress, we can just call into the
        // existng tenant.
-        let modify_tenant = {
+        let fast_path_taken = {
            let locked = self.tenants.read().unwrap();
            let peek_slot =
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -882,12 +917,19 @@ impl TenantManager {
                            new_location_config.clone(),
                        )?);

-                        Some(tenant.clone())
+                        Some(FastPathModified::Attached(tenant.clone()))
                    } else {
                        // Different generations, fall through to general case
                        None
                    }
                }
+                (
+                    LocationMode::Secondary(secondary_conf),
+                    Some(TenantSlot::Secondary(secondary_tenant)),
+                ) => {
+                    secondary_tenant.set_config(secondary_conf);
+                    Some(FastPathModified::Secondary(secondary_tenant.clone()))
+                }
                _ => {
                    // Not an Attached->Attached transition, fall through to general case
                    None
@@ -896,34 +938,46 @@ impl TenantManager {
        };

        // Fast-path continued: having dropped out of the self.tenants lock, do the async
-        // phase of waiting for flush, before returning.
-        if let Some(tenant) = modify_tenant {
-            // Transition to AttachedStale means we may well hold a valid generation
-            // still, and have been requested to go stale as part of a migration.  If
-            // the caller set `flush`, then flush to remote storage.
-            if let LocationMode::Attached(AttachedLocationConfig {
-                generation: _,
-                attach_mode: AttachmentMode::Stale,
-            }) = &new_location_config.mode
-            {
-                if let Some(flush_timeout) = flush {
-                    match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
-                        Ok(Err(e)) => {
-                            return Err(e);
-                        }
-                        Ok(Ok(_)) => return Ok(()),
-                        Err(_) => {
-                            tracing::warn!(
+        // phase of writing config and/or waiting for flush, before returning.
+        match fast_path_taken {
+            Some(FastPathModified::Attached(tenant)) => {
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                // Transition to AttachedStale means we may well hold a valid generation
+                // still, and have been requested to go stale as part of a migration.  If
+                // the caller set `flush`, then flush to remote storage.
+                if let LocationMode::Attached(AttachedLocationConfig {
+                    generation: _,
+                    attach_mode: AttachmentMode::Stale,
+                }) = &new_location_config.mode
+                {
+                    if let Some(flush_timeout) = flush {
+                        match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+                            Ok(Err(e)) => {
+                                return Err(e);
+                            }
+                            Ok(Ok(_)) => return Ok(()),
+                            Err(_) => {
+                                tracing::warn!(
                                timeout_ms = flush_timeout.as_millis(),
                                "Timed out waiting for flush to remote storage, proceeding anyway."
                            )
+                            }
                        }
                    }
                }
-            }

-            return Ok(());
-        }
+                return Ok(());
+            }
+            Some(FastPathModified::Secondary(_secondary_tenant)) => {
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+            }
+            None => {}
+        };

        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
        // InProgress value to the slot while we make whatever changes are required.  The state for
@@ -932,54 +986,67 @@ impl TenantManager {
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;

-        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
-            // The case where we keep a Tenant alive was covered above in the special case
-            // for Attached->Attached transitions in the same generation.  By this point,
-            // if we see an attached tenant we know it will be discarded and should be
-            // shut down.
-            let (_guard, progress) = utils::completion::channel();
+        match slot_guard.get_old_value() {
+            Some(TenantSlot::Attached(tenant)) => {
+                // The case where we keep a Tenant alive was covered above in the special case
+                // for Attached->Attached transitions in the same generation.  By this point,
+                // if we see an attached tenant we know it will be discarded and should be
+                // shut down.
+                let (_guard, progress) = utils::completion::channel();

-            match tenant.get_attach_mode() {
-                AttachmentMode::Single | AttachmentMode::Multi => {
-                    // Before we leave our state as the presumed holder of the latest generation,
-                    // flush any outstanding deletions to reduce the risk of leaking objects.
-                    self.resources.deletion_queue_client.flush_advisory()
-                }
-                AttachmentMode::Stale => {
-                    // If we're stale there's not point trying to flush deletions
-                }
-            };
+                match tenant.get_attach_mode() {
+                    AttachmentMode::Single | AttachmentMode::Multi => {
+                        // Before we leave our state as the presumed holder of the latest generation,
+                        // flush any outstanding deletions to reduce the risk of leaking objects.
+                        self.resources.deletion_queue_client.flush_advisory()
+                    }
+                    AttachmentMode::Stale => {
+                        // If we're stale there's not point trying to flush deletions
+                    }
+                };

-            info!("Shutting down attached tenant");
-            match tenant.shutdown(progress, false).await {
-                Ok(()) => {}
-                Err(barrier) => {
-                    info!("Shutdown already in progress, waiting for it to complete");
-                    barrier.wait().await;
+                info!("Shutting down attached tenant");
+                match tenant.shutdown(progress, false).await {
+                    Ok(()) => {}
+                    Err(barrier) => {
+                        info!("Shutdown already in progress, waiting for it to complete");
+                        barrier.wait().await;
+                    }
                }
+                slot_guard.drop_old_value().expect("We just shut it down");
+            }
+            Some(TenantSlot::Secondary(state)) => {
+                info!("Shutting down secondary tenant");
+                state.shutdown().await;
+            }
+            Some(TenantSlot::InProgress(_)) => {
+                // This should never happen: acquire_slot should error out
+                // if the contents of a slot were InProgress.
+                anyhow::bail!("Acquired an InProgress slot, this is a bug.")
+            }
+            None => {
+                // Slot was vacant, nothing needs shutting down.
            }
-            slot_guard.drop_old_value().expect("We just shut it down");
        }

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+        let timelines_path = self.conf.timelines_path(&tenant_shard_id);

        let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(_) => {
+            LocationMode::Secondary(secondary_config) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                tokio::fs::create_dir_all(&tenant_path)
+                tokio::fs::create_dir_all(&timelines_path)
                    .await
-                    .with_context(|| format!("Creating {tenant_path}"))?;
+                    .with_context(|| format!("Creating {timelines_path}"))?;

                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
                    .await
                    .map_err(SetNewTenantConfigError::Persist)?;

-                TenantSlot::Secondary
+                TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
            }
            LocationMode::Attached(_attach_config) => {
-                let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
@@ -1102,6 +1169,80 @@ impl TenantManager {
                .collect(),
        }
    }
+    // Do some synchronous work for all tenant slots in Secondary state.  The provided
+    // callback should be small and fast, as it will be called inside the global
+    // TenantsMap lock.
+    pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
+    where
+        // TODO: let the callback return a hint to drop out of the loop early
+        F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
+    {
+        let locked = self.tenants.read().unwrap();
+
+        let map = match &*locked {
+            TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
+            TenantsMap::Open(m) => m,
+        };
+
+        for (tenant_id, slot) in map {
+            if let TenantSlot::Secondary(state) = slot {
+                // Only expose secondary tenants that are not currently shutting down
+                if !state.cancel.is_cancelled() {
+                    func(tenant_id, state)
+                }
+            }
+        }
+    }
+
+    /// Having planned some evictions for a tenant, attempt to execute them.
+    ///
+    /// Execution will not occur if the TenantSlot for this tenant is not in
+    /// a state suitable to execute.
+    // TODO: is Layer really needed here?  Maybe we should have reduced to a LayerFileName by this point.
+    pub(crate) async fn evict_tenant_layers(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_layers: Vec<(TimelineId, Layer)>,
+    ) {
+        // TODO: unify with how we evict for attached tenants.  They should also
+        // pass through here, to avoid attached tenant evictions racing with
+        // the lifetime of secondary locations for the same tenant ID.
+
+        let state = {
+            let locked = self.tenants.read().unwrap();
+            let map = match &*locked {
+                TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
+                TenantsMap::Open(m) => m,
+            };
+
+            match map.get(tenant_shard_id) {
+                Some(TenantSlot::Secondary(secondary_state)) => {
+                    // Found a secondary as expected
+                    secondary_state.clone()
+                }
+                _ => {
+                    // A location configuration change raced with this eviction
+                    tracing::info!(
+                        "Dropping {} layer evictions, tenant not in suitable state",
+                        timeline_layers.len()
+                    );
+                    return;
+                }
+            }
+        };
+
+        // Concurrency: downloads might have been going on while we deleted layers.  However,
+        // we are only deleting layers that the SecondaryTenant already thought were on disk,
+        // so we won't be deleting anything that it is _currently_ downloading.  All deletions
+        // of SecondaryTenant layers flow through this function, so there is no risk that the
+        // layer we're evicting is no longer present in-memory.
+        state
+            .evict_layers(self.conf, tenant_shard_id, timeline_layers)
+            .instrument(tracing::info_span!("evict_layers",
+                tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()
+            ))
+            .await;
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1151,7 +1292,7 @@ pub(crate) fn get_tenant(
        Some(TenantSlot::InProgress(_)) => {
            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
        }
-        None | Some(TenantSlot::Secondary) => {
+        None | Some(TenantSlot::Secondary(_)) => {
            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
        }
    }
@@ -1222,7 +1363,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    }
                }
            }
-            Some(TenantSlot::Secondary) => {
+            Some(TenantSlot::Secondary(_)) => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
                    tenant_id,
                )))
@@ -1521,7 +1662,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-            TenantSlot::Secondary => None,
+            TenantSlot::Secondary(_) => None,
            TenantSlot::InProgress(_) => None,
        })
        .collect())
@@ -1778,11 +1919,7 @@ impl SlotGuard {
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
-            Some(TenantSlot::Secondary) => {
-                // TODO: when adding secondary mode tenants, this will check for shutdown
-                // in the same way that we do for `Tenant` above
-                true
-            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
            Some(TenantSlot::InProgress(_)) => {
                // A SlotGuard cannot be constructed for a slot that was already InProgress
                unreachable!()
@@ -1992,26 +2129,19 @@ where
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

-    // The SlotGuard allows us to manipulate the Tenant object without fear of some
-    // concurrent API request doing something else for the same tenant ID.
-    let attached_tenant = match slot_guard.get_old_value() {
-        Some(TenantSlot::Attached(t)) => Some(t),
-        _ => None,
-    };
-
    // allow pageserver shutdown to await for our completion
    let (_guard, progress) = completion::channel();

-    // If the tenant was attached, shut it down gracefully.  For secondary
-    // locations this part is not necessary
-    match &attached_tenant {
-        Some(attached_tenant) => {
+    // The SlotGuard allows us to manipulate the Tenant object without fear of some
+    // concurrent API request doing something else for the same tenant ID.
+    let attached_tenant = match slot_guard.get_old_value() {
+        Some(TenantSlot::Attached(tenant)) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
            let freeze_and_flush = false;

            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match attached_tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, freeze_and_flush).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2020,11 +2150,19 @@ where
                    return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
                }
            }
+            Some(tenant)
        }
-        None => {
-            // Nothing to wait on when not attached, proceed.
+        Some(TenantSlot::Secondary(secondary_state)) => {
+            tracing::info!("Shutting down in secondary mode");
+            secondary_state.shutdown().await;
+            None
        }
-    }
+        Some(TenantSlot::InProgress(_)) => {
+            // Acquiring a slot guarantees its old value was not InProgress
+            unreachable!();
+        }
+        None => None,
+    };

    match tenant_cleanup
        .await
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,24 +1,58 @@
+mod downloader;
 pub mod heatmap;
 mod heatmap_uploader;
+mod scheduler;

-use std::sync::Arc;
+use std::{sync::Arc, time::SystemTime};

-use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::{
+    config::PageServerConf,
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    tenant::span::debug_assert_current_span_has_tenant_id,
+};

-use self::heatmap_uploader::heatmap_uploader_task;
+use self::{
+    downloader::{downloader_task, SecondaryDetail},
+    heatmap_uploader::heatmap_uploader_task,
+    scheduler::TenantScoped,
+};

-use super::mgr::TenantManager;
+use super::{
+    config::SecondaryLocationConfig,
+    mgr::TenantManager,
+    storage_layer::{AsLayerDesc, Layer},
+    timeline::DiskUsageEvictionInfo,
+};

 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;

 use tokio_util::sync::CancellationToken;
-use utils::completion::Barrier;
+use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};

+enum DownloadCommand {
+    Download(TenantShardId),
+}
 enum UploadCommand {
    Upload(TenantShardId),
 }

+impl TenantScoped for UploadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Upload(id) => id,
+        }
+    }
+}
+
+impl TenantScoped for DownloadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Download(id) => id,
+        }
+    }
+}
+
 struct CommandRequest<T> {
    payload: T,
    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -28,12 +62,145 @@ struct CommandResponse {
    result: anyhow::Result<()>,
 }

+// Whereas [`Tenant`] represents an attached tenant, this type represents the work
+// we do for secondary tenant locations: where we are not serving clients or
+// ingesting WAL, but we are maintaining a warm cache of layer files.
+//
+// This type is all about the _download_ path for secondary mode.  The upload path
+// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
+//
+// This structure coordinates TenantManager and SecondaryDownloader,
+// so that the downloader can indicate which tenants it is currently
+// operating on, and the manager can indicate when a particular
+// secondary tenant should cancel any work in flight.
+#[derive(Debug)]
+pub(crate) struct SecondaryTenant {
+    /// Carrying a tenant shard ID simplifies callers such as the downloader
+    /// which need to organize many of these objects by ID.
+    tenant_shard_id: TenantShardId,
+
+    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
+    /// any work for this tenant at the next opportunity.
+    pub(crate) cancel: CancellationToken,
+
+    pub(crate) gate: Gate,
+
+    detail: std::sync::Mutex<SecondaryDetail>,
+}
+
+impl SecondaryTenant {
+    pub(crate) fn new(
+        tenant_shard_id: TenantShardId,
+        config: &SecondaryLocationConfig,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            tenant_shard_id,
+            // todo: shall we make this a descendent of the
+            // main cancellation token, or is it sufficient that
+            // on shutdown we walk the tenants and fire their
+            // individual cancellations?
+            cancel: CancellationToken::new(),
+            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
+
+            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
+        })
+    }
+
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        // Wait for any secondary downloader work to complete
+        self.gate.close().await;
+    }
+
+    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
+        self.detail.lock().unwrap().config = config.clone();
+    }
+
+    pub(crate) async fn get_layers_for_eviction(
+        &self,
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+    ) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
+        debug_assert_current_span_has_tenant_id();
+        {
+            let detail = self.detail.lock().unwrap();
+            if !detail.is_uninit() {
+                return detail.get_layers_for_eviction();
+            } else {
+                // In case we didn't freshen yet in this process lifetime, we will need to scan local storage
+                // to find all our layers.
+            }
+        }
+
+        tracing::debug!("Scanning local layers for secondary tenant to service eviction",);
+
+        // Fall through: we need to initialize Detail
+        let timelines = SecondaryDetail::init_detail(conf, tenant_shard_id).await;
+        let mut detail = self.detail.lock().unwrap();
+        if detail.is_uninit() {
+            detail.timelines = timelines;
+        }
+        detail.get_layers_for_eviction()
+    }
+
+    pub(crate) async fn evict_layers(
+        &self,
+        conf: &PageServerConf,
+        tenant_shard_id: &TenantShardId,
+        layers: Vec<(TimelineId, Layer)>,
+    ) {
+        debug_assert_current_span_has_tenant_id();
+        let _guard = match self.gate.enter() {
+            Ok(g) => g,
+            Err(_) => {
+                tracing::info!(
+                    "Dropping {} layer evictions, secondary tenant shutting down",
+                    layers.len()
+                );
+                return;
+            }
+        };
+
+        let now = SystemTime::now();
+
+        for (timeline_id, layer) in layers {
+            let layer_name = layer.layer_desc().filename();
+            let path = conf
+                .timeline_path(tenant_shard_id, &timeline_id)
+                .join(&layer_name.file_name());
+
+            // We tolerate ENOENT, because between planning eviction and executing
+            // it, the secondary downloader could have seen an updated heatmap that
+            // resulted in a layer being deleted.
+            tokio::fs::remove_file(path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+                .expect("TODO: terminate process on local I/O errors");
+
+            // TODO: batch up updates instead of acquiring lock in inner loop
+            let mut detail = self.detail.lock().unwrap();
+            // If there is no timeline detail for what we just deleted, that indicates that
+            // the secondary downloader did some work (perhaps removing all)
+            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
+                timeline_detail.on_disk_layers.remove(&layer_name);
+                timeline_detail.evicted_at.insert(layer_name, now);
+            }
+        }
+    }
+
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+}
+
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
 /// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
 /// where we want to immediately upload/download for a particular tenant.  In normal operation
 /// uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
 }

 impl SecondaryController {
@@ -63,6 +230,13 @@ impl SecondaryController {
        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
            .await
    }
+    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.dispatch(
+            &self.download_req_tx,
+            DownloadCommand::Download(tenant_shard_id),
+        )
+        .await
+    }
 }

 pub fn spawn_tasks(
@@ -71,9 +245,37 @@ pub fn spawn_tasks(
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
 ) -> SecondaryController {
+    let mgr_clone = tenant_manager.clone();
+    let storage_clone = remote_storage.clone();
+    let cancel_clone = cancel.clone();
+    let bg_jobs_clone = background_jobs_can_start.clone();
+
+    let (download_req_tx, download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryDownloads,
+        None,
+        None,
+        "secondary tenant downloads",
+        false,
+        async move {
+            downloader_task(
+                mgr_clone,
+                storage_clone,
+                download_req_rx,
+                bg_jobs_clone,
+                cancel_clone,
+            )
+            .await;
+
+            Ok(())
+        },
+    );
+
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryUploads,
@@ -89,16 +291,26 @@ pub fn spawn_tasks(
                background_jobs_can_start,
                cancel,
            )
-            .await
+            .await;
+
+            Ok(())
        },
    );

-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        download_req_tx,
+        upload_req_tx,
+    }
 }

 /// For running with remote storage disabled: a SecondaryController that is connected to nothing.
 pub fn null_controller() -> SecondaryController {
+    let (download_req_tx, _download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, _upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        upload_req_tx,
+        download_req_tx,
+    }
 }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -0,0 +1,791 @@
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    sync::Arc,
+    time::{Duration, Instant, SystemTime},
+};
+
+use crate::{
+    config::PageServerConf,
+    metrics::SECONDARY_MODE,
+    tenant::{
+        config::SecondaryLocationConfig,
+        debug_assert_current_span_has_tenant_and_timeline_id,
+        remote_timeline_client::{index::LayerFileMetadata, HEATMAP_BASENAME},
+        span::debug_assert_current_span_has_tenant_id,
+        storage_layer::{Layer, LayerFileName},
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+        timeline::{DiskUsageEvictionInfo, LocalLayerInfoForDiskUsageEviction},
+    },
+    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
+    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
+};
+
+use super::{
+    scheduler::{HasBarrier, JobGenerator, SchedulingResult, TenantBackgroundJobs, TenantScoped},
+    SecondaryTenant,
+};
+use crate::tenant::{
+    mgr::TenantManager,
+    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
+};
+use anyhow::Context;
+
+use chrono::format::{DelayedFormat, StrftimeItems};
+use pageserver_api::shard::TenantShardId;
+use rand::Rng;
+use remote_storage::GenericRemoteStorage;
+
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use tracing::{info_span, instrument, Instrument};
+use utils::{completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId};
+
+use super::{
+    heatmap::{HeatMapTenant, HeatMapTimeline},
+    CommandRequest, DownloadCommand,
+};
+
+/// For each tenant, how long must have passed since the last download_tenant call before
+/// calling it again.  This is approximately the time by which local data is allowed
+/// to fall behind remote data.
+///
+/// TODO: this should just be a default, and the actual period should be controlled
+/// via the heatmap itself
+/// `<ttps://github.com/neondatabase/neon/issues/6200>`
+const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+
+#[derive(Debug, Clone)]
+pub(super) struct OnDiskState {
+    layer: Layer,
+    access_time: SystemTime,
+}
+
+impl OnDiskState {
+    fn new(
+        conf: &'static PageServerConf,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        name: LayerFileName,
+        metadata: LayerFileMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            layer: Layer::for_secondary(conf, tenant_shard_id, timeline_id, name, metadata),
+            access_time,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+pub(super) struct SecondaryDetailTimeline {
+    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
+
+    /// We remember when layers were evicted, to prevent re-downloading them.
+    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
+}
+
+/// This state is written by the secondary downloader, it is opaque
+/// to TenantManager
+#[derive(Debug)]
+pub(super) struct SecondaryDetail {
+    pub(super) config: SecondaryLocationConfig,
+
+    last_download: Option<Instant>,
+    next_download: Option<Instant>,
+    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+}
+
+/// Helper for logging SystemTime
+fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
+    let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
+    datetime.format("%d/%m/%Y %T")
+}
+
+impl SecondaryDetail {
+    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
+        Self {
+            config,
+            last_download: None,
+            next_download: None,
+            timelines: HashMap::new(),
+        }
+    }
+
+    pub(super) fn is_uninit(&self) -> bool {
+        // FIXME: empty timelines is not synonymous with not initialized, as it is legal for
+        // a tenant to exist with no timelines.
+        self.timelines.is_empty()
+    }
+
+    pub(super) async fn init_detail(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+    ) -> HashMap<TimelineId, SecondaryDetailTimeline> {
+        tracing::info!("init_detail");
+        // Load heatmap from local storage
+        let heatmap_path = conf.tenant_path(&tenant_shard_id).join(HEATMAP_BASENAME);
+        let heatmap = match tokio::fs::read(&heatmap_path).await {
+            Ok(bytes) => serde_json::from_slice::<HeatMapTenant>(&bytes).unwrap(),
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    return HashMap::new();
+                } else {
+                    on_fatal_io_error(&e, &format!("Reading heatmap file from {heatmap_path}"))
+                }
+            }
+        };
+
+        let mut timelines = HashMap::new();
+
+        for heatmap_timeline in heatmap.timelines {
+            let detail = init_timeline_state(conf, &tenant_shard_id, &heatmap_timeline).await;
+            timelines.insert(heatmap_timeline.timeline_id, detail);
+        }
+
+        timelines
+    }
+
+    pub(super) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
+        let mut result = Vec::new();
+        for (timeline_id, timeline_detail) in &self.timelines {
+            let layers: Vec<_> = timeline_detail
+                .on_disk_layers
+                .values()
+                .map(|ods| LocalLayerInfoForDiskUsageEviction {
+                    layer: ods.layer.clone(),
+                    last_activity_ts: ods.access_time,
+                })
+                .collect();
+
+            let max_layer_size = layers.iter().map(|l| l.layer.metadata().file_size()).max();
+
+            result.push((
+                *timeline_id,
+                DiskUsageEvictionInfo {
+                    resident_layers: layers,
+                    max_layer_size,
+                },
+            ))
+        }
+
+        tracing::debug!(
+            "Found {} timelines, {} layers",
+            self.timelines.len(),
+            result.len()
+        );
+
+        result
+    }
+}
+
+struct SecondaryDownloader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+}
+
+struct PendingDownload {
+    secondary_state: Arc<SecondaryTenant>,
+    last_download: Option<Instant>,
+    target_time: Option<Instant>,
+    period: Option<Duration>,
+}
+
+impl TenantScoped for PendingDownload {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.secondary_state.get_tenant_shard_id()
+    }
+}
+
+struct RunningDownload {
+    barrier: Barrier,
+}
+
+impl HasBarrier for RunningDownload {
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
+struct CompleteDownload {
+    secondary_state: Arc<SecondaryTenant>,
+    completed_at: Instant,
+}
+
+impl TenantScoped for CompleteDownload {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.secondary_state.get_tenant_shard_id()
+    }
+}
+
+impl TenantScoped for SecondaryTenant {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.get_tenant_shard_id()
+    }
+}
+
+type Scheduler = TenantBackgroundJobs<
+    SecondaryDownloader,
+    PendingDownload,
+    RunningDownload,
+    CompleteDownload,
+    DownloadCommand,
+>;
+
+#[async_trait::async_trait]
+impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
+    for SecondaryDownloader
+{
+    #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
+    fn on_completion(&mut self, completion: CompleteDownload) {
+        let CompleteDownload {
+            secondary_state,
+            completed_at: _completed_at,
+        } = completion;
+
+        tracing::debug!("Secondary tenant download completed");
+
+        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
+        // take priority to run again.
+        let mut detail = secondary_state.detail.lock().unwrap();
+        detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
+    }
+
+    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
+        let mut result = SchedulingResult {
+            jobs: Vec::new(),
+            want_interval: None,
+        };
+
+        // Step 1: identify some tenants that we may work on
+        let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
+        self.tenant_manager
+            .foreach_secondary_tenants(|_id, secondary_state| {
+                tenants.push(secondary_state.clone());
+            });
+
+        // Step 2: filter out tenants which are not yet elegible to run
+        let now = Instant::now();
+        result.jobs = tenants
+            .into_iter()
+            .filter_map(|c| {
+                let (last_download, next_download) = {
+                    let mut detail = c.detail.lock().unwrap();
+
+                    if !detail.config.warm {
+                        // Downloads are disabled for this tenant
+                        detail.next_download = None;
+                        return None;
+                    }
+
+                    if detail.next_download.is_none() {
+                        // Initialize with a jitter: this spreads initial downloads on startup
+                        // or mass-attach across our freshen interval.
+                        let jittered_period =
+                            rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
+                        detail.next_download = Some(now.checked_add(jittered_period).expect(
+                        "Using our constant, which is known to be small compared with clock range",
+                    ));
+                    }
+                    (detail.last_download, detail.next_download.unwrap())
+                };
+
+                if now < next_download {
+                    Some(PendingDownload {
+                        secondary_state: c,
+                        last_download,
+                        target_time: Some(next_download),
+                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
+                    })
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        // Step 3: sort by target execution time to run most urgent first.
+        result.jobs.sort_by_key(|j| j.target_time);
+
+        result
+    }
+
+    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
+        let tenant_shard_id = command.get_tenant_shard_id();
+
+        let tenant = self
+            .tenant_manager
+            .get_secondary_tenant_shard(*tenant_shard_id);
+        let Some(tenant) = tenant else {
+            {
+                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
+            }
+        };
+
+        Ok(PendingDownload {
+            target_time: None,
+            period: None,
+            last_download: None,
+            secondary_state: tenant,
+        })
+    }
+
+    fn spawn(
+        &mut self,
+        join_set: &mut JoinSet<()>,
+        result_tx: tokio::sync::mpsc::UnboundedSender<CompleteDownload>,
+        job: PendingDownload,
+    ) -> RunningDownload {
+        let PendingDownload {
+            secondary_state,
+            last_download,
+            target_time,
+            period,
+        } = job;
+
+        let (completion, barrier) = utils::completion::channel();
+        let remote_storage = self.remote_storage.clone();
+        let conf = self.tenant_manager.get_conf();
+        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
+        join_set.spawn(async move {
+            let _completion = completion;
+
+            if let Err(e) = TenantDownloader::new(conf, &remote_storage, &secondary_state)
+                .download()
+                .await
+            {
+                tracing::info!("Failed to freshen secondary content: {e:#}")
+            };
+
+            // If the job had a target execution time, we may check our final execution
+            // time against that for observability purposes.
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Only track execution lag if this isn't our first download: otherwise, it is expected
+                // that execution will have taken longer than our configured interval, for example
+                // when starting up a pageserver and 
+                if last_download.is_some() {
+                    // Elapsed time includes any scheduling lag as well as the execution of the job
+                    let elapsed = Instant::now().duration_since(target_time);
+
+                    warn_when_period_overrun(
+                        elapsed,
+                        period,
+                        BackgroundLoopKind::SecondaryDownload,
+                    );
+                }
+            }
+
+            result_tx
+                .send(CompleteDownload {
+                    secondary_state,
+                    completed_at: Instant::now(),
+                })
+                .ok();
+        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())));
+        RunningDownload { barrier }
+    }
+}
+
+pub(super) async fn downloader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) {
+    // TODO: separate config for downloads
+    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let generator = SecondaryDownloader {
+        tenant_manager,
+        remote_storage,
+    };
+    let mut scheduler = Scheduler::new(generator, concurrency);
+
+    scheduler
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("secondary_downloads"))
+        .await
+}
+
+/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
+async fn init_timeline_state(
+    conf: &'static PageServerConf,
+    tenant_shard_id: &TenantShardId,
+    heatmap: &HeatMapTimeline,
+) -> SecondaryDetailTimeline {
+    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
+    let mut detail = SecondaryDetailTimeline::default();
+
+    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
+        Ok(d) => d,
+        Err(e) => {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                let context = format!("Creating timeline directory {timeline_path}");
+                tracing::info!("{}", context);
+                tokio::fs::create_dir_all(&timeline_path)
+                    .await
+                    .fatal_err(&context);
+
+                // No entries to report: drop out.
+                return detail;
+            } else {
+                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
+            }
+        }
+    };
+
+    let heatmap_metadata: HashMap<_, _> = heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+
+    while let Some(dentry) = dir
+        .next_entry()
+        .await
+        .fatal_err(&format!("Listing {timeline_path}"))
+    {
+        let dentry_file_name = dentry.file_name();
+        let file_name = dentry_file_name.to_string_lossy();
+        let local_meta = dentry.metadata().await.fatal_err(&format!(
+            "Read metadata on {}",
+            dentry.path().to_string_lossy()
+        ));
+
+        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+        if file_name == METADATA_FILE_NAME {
+            continue;
+        }
+
+        match LayerFileName::from_str(&file_name) {
+            Ok(name) => {
+                let remote_meta = heatmap_metadata.get(&name);
+                match remote_meta {
+                    Some(remote_meta) => {
+                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
+                        if local_meta.len() != remote_meta.metadata.file_size {
+                            // This should not happen, because we do crashsafe write-then-rename when downloading
+                            // layers, and layers in remote storage are immutable.  Remove the local file because
+                            // we cannot trust it.
+                            tracing::warn!(
+                                "Removing local layer {name} with unexpected local size {} != {}",
+                                local_meta.len(),
+                                remote_meta.metadata.file_size
+                            );
+                        } else {
+                            // We expect the access time to be initialized immediately afterwards, when
+                            // the latest heatmap is applied to the state.
+                            detail.on_disk_layers.insert(
+                                name.clone(),
+                                OnDiskState::new(
+                                    conf,
+                                    tenant_shard_id,
+                                    &heatmap.timeline_id,
+                                    name,
+                                    LayerFileMetadata::from(&remote_meta.metadata),
+                                    remote_meta.access_time,
+                                ),
+                            );
+                        }
+                    }
+                    None => {
+                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
+                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
+                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
+                        tracing::info!(
+                            "Removing secondary local layer {} because it's absent in heatmap",
+                            name
+                        );
+                        tokio::fs::remove_file(&dentry.path())
+                            .await
+                            .or_else(fs_ext::ignore_not_found)
+                            .fatal_err(&format!(
+                                "Removing layer {}",
+                                dentry.path().to_string_lossy()
+                            ));
+                    }
+                }
+            }
+            Err(_) => {
+                // Ignore it.
+                tracing::warn!("Unexpected file in timeline directory: {file_name}");
+            }
+        }
+    }
+
+    detail
+}
+
+/// This type is a convenience to group together the various functions involved in
+/// freshening a secondary tenant.
+struct TenantDownloader<'a> {
+    conf: &'static PageServerConf,
+    remote_storage: &'a GenericRemoteStorage,
+    secondary_state: &'a SecondaryTenant,
+}
+
+impl<'a> TenantDownloader<'a> {
+    fn new(
+        conf: &'static PageServerConf,
+        remote_storage: &'a GenericRemoteStorage,
+        secondary_state: &'a SecondaryTenant,
+    ) -> Self {
+        Self {
+            conf,
+            remote_storage,
+            secondary_state,
+        }
+    }
+
+    async fn download(&self) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
+        // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
+        // cover our access to local storage.
+        let Ok(_guard) = self.secondary_state.gate.enter() else {
+            // Shutting down
+            return Ok(());
+        };
+
+        debug_assert_current_span_has_tenant_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // Download the tenant's heatmap
+        let heatmap_bytes = tokio::select!(
+            bytes = self.download_heatmap() => {bytes?},
+            _ = self.secondary_state.cancel.cancelled() => return Ok(())
+        );
+
+        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
+
+        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
+        // layer metadata without having to re-download it.
+        let heatmap_path = self
+            .conf
+            .tenant_path(tenant_shard_id)
+            .join(HEATMAP_BASENAME);
+
+        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
+        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
+        let heatmap_path_bg = heatmap_path.clone();
+        tokio::task::spawn_blocking(move || {
+            tokio::runtime::Handle::current().block_on(async move {
+                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
+            })
+        })
+        .await?
+        .maybe_fatal_err(&context_msg)
+        .with_context(|| context_msg)?;
+
+        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+
+        // Download the layers in the heatmap
+        for timeline in heatmap.timelines {
+            if self.secondary_state.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            let timeline_id = timeline.timeline_id;
+            self.download_timeline(timeline)
+                .instrument(tracing::info_span!(
+                    "secondary_download_timeline",
+                    tenant_id=%tenant_shard_id.tenant_id,
+                    shard_id=%tenant_shard_id.shard_slug(),
+                    %timeline_id
+                ))
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    async fn download_heatmap(&self) -> anyhow::Result<Vec<u8>> {
+        debug_assert_current_span_has_tenant_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // TODO: make download conditional on ETag having changed since last download
+        // (https://github.com/neondatabase/neon/issues/6199)
+        tracing::debug!("Downloading heatmap for secondary tenant",);
+
+        let heatmap_path = remote_heatmap_path(tenant_shard_id);
+        let download = self.remote_storage.download(&heatmap_path).await?;
+        let mut heatmap_bytes = Vec::new();
+        let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+        let _size = tokio::io::copy(&mut body, &mut heatmap_bytes)
+            .await
+            .with_context(|| format!("download heatmap {heatmap_path:?}"))?;
+
+        SECONDARY_MODE.download_heatmap.inc();
+
+        Ok(heatmap_bytes)
+    }
+
+    async fn download_timeline(&self, timeline: HeatMapTimeline) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        let timeline_path = self
+            .conf
+            .timeline_path(tenant_shard_id, &timeline.timeline_id);
+
+        // Accumulate updates to the state
+        let mut touched = Vec::new();
+
+        // Clone a view of what layers already exist on disk
+        let timeline_state = self
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .timelines
+            .get(&timeline.timeline_id)
+            .cloned();
+
+        let timeline_state = match timeline_state {
+            Some(t) => t,
+            None => {
+                // We have no existing state: need to scan local disk for layers first.
+                let timeline_state =
+                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
+
+                // Re-acquire detail lock now that we're done with async load from local FS
+                self.secondary_state
+                    .detail
+                    .lock()
+                    .unwrap()
+                    .timelines
+                    .insert(timeline.timeline_id, timeline_state.clone());
+                timeline_state
+            }
+        };
+
+        let layers_in_heatmap = timeline
+            .layers
+            .iter()
+            .map(|l| &l.name)
+            .collect::<HashSet<_>>();
+        let layers_on_disk = timeline_state
+            .on_disk_layers
+            .iter()
+            .map(|l| l.0)
+            .collect::<HashSet<_>>();
+
+        // Remove on-disk layers that are no longer present in heatmap
+        for layer in layers_on_disk.difference(&layers_in_heatmap) {
+            let local_path = timeline_path.join(layer.to_string());
+            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)?;
+        }
+
+        // Download heatmap layers that are not present on local disk, or update their
+        // access time if they are already present.
+        for layer in timeline.layers {
+            if self.secondary_state.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            // Existing on-disk layers: just update their access time.
+            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
+                tracing::debug!("Layer {} is already on disk", layer.name);
+                if on_disk.layer.metadata() != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
+                    // We already have this layer on disk.  Update its access time.
+                    tracing::debug!(
+                        "Access time updated for layer {}: {} -> {}",
+                        layer.name,
+                        strftime(&on_disk.access_time),
+                        strftime(&layer.access_time)
+                    );
+                    touched.push(layer);
+                }
+                continue;
+            } else {
+                tracing::debug!("Layer {} not present on disk yet", layer.name);
+            }
+
+            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
+            // recently than it was evicted.
+            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
+                if &layer.access_time > evicted_at {
+                    tracing::info!(
+                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                } else {
+                    tracing::trace!(
+                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                    continue;
+                }
+            }
+
+            match download_layer_file(
+                self.conf,
+                self.remote_storage,
+                *tenant_shard_id,
+                timeline.timeline_id,
+                &layer.name,
+                &LayerFileMetadata::from(&layer.metadata),
+                &self.secondary_state.cancel,
+            )
+            .await
+            {
+                Ok(downloaded_bytes) => {
+                    if downloaded_bytes != layer.metadata.file_size {
+                        let local_path = timeline_path.join(layer.name.to_string());
+
+                        tracing::error!(
+                            "Downloaded layer {} with unexpected size {} != {}",
+                            layer.name,
+                            downloaded_bytes,
+                            layer.metadata.file_size
+                        );
+
+                        tokio::fs::remove_file(&local_path)
+                            .await
+                            .or_else(fs_ext::ignore_not_found)?;
+                    }
+
+                    SECONDARY_MODE.download_layer.inc();
+                    touched.push(layer)
+                }
+                Err(e) => {
+                    // No retries here: secondary downloads don't have to succeed: if they fail we just proceed and expect
+                    // that on some future call to freshen the download will work.
+                    // TODO: refine this behavior.
+                    tracing::info!("Failed to download layer {}: {}", layer.name, e);
+                }
+            }
+        }
+
+        // Write updates to state to record layers we just downloaded or touched.
+        {
+            let mut detail = self.secondary_state.detail.lock().unwrap();
+            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
+
+            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
+
+            for t in touched {
+                use std::collections::hash_map::Entry;
+                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
+                    Entry::Occupied(mut v) => {
+                        v.get_mut().access_time = t.access_time;
+                    }
+                    Entry::Vacant(e) => {
+                        e.insert(OnDiskState::new(
+                            self.conf,
+                            tenant_shard_id,
+                            &timeline.timeline_id,
+                            t.name,
+                            LayerFileMetadata::from(&t.metadata),
+                            t.access_time,
+                        ));
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -7,35 +7,55 @@ use std::{
 use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
-        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
-        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
+        config::AttachmentMode,
+        mgr::TenantManager,
+        remote_timeline_client::remote_heatmap_path,
+        span::debug_assert_current_span_has_tenant_id,
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+        Tenant,
    },
 };

 use md5;
 use pageserver_api::shard::TenantShardId;
+use rand::Rng;
 use remote_storage::GenericRemoteStorage;

+use super::{
+    scheduler::{
+        yielding_loop, HasBarrier, JobGenerator, SchedulingResult, TenantBackgroundJobs,
+        TenantScoped,
+    },
+    CommandRequest,
+};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::instrument;
+use tracing::{info_span, instrument, Instrument};
 use utils::{backoff, completion::Barrier};

-use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
-
-/// Period between heatmap uploader walking Tenants to look for work to do.
-/// If any tenants have a heatmap upload period lower than this, it will be adjusted
-/// downward to match.
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
+use super::{heatmap::HeatMapTenant, UploadCommand};

 struct WriteInProgress {
    barrier: Barrier,
 }

+impl HasBarrier for WriteInProgress {
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
 struct UploadPending {
    tenant: Arc<Tenant>,
    last_digest: Option<md5::Digest>,
+    target_time: Option<Instant>,
+    period: Option<Duration>,
+}
+
+impl TenantScoped for UploadPending {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.tenant.get_tenant_shard_id()
+    }
 }

 struct WriteComplete {
@@ -45,6 +65,18 @@ struct WriteComplete {
    next_upload: Option<Instant>,
 }

+impl TenantScoped for WriteComplete {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+}
+
+impl TenantScoped for Tenant {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.get_tenant_shard_id()
+    }
+}
+
 /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
@@ -77,258 +109,135 @@ struct HeatmapUploader {
    cancel: CancellationToken,

    tenants: HashMap<TenantShardId, UploaderTenantState>,
-
-    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
-    /// limits permit it.
-    tenants_pending: std::collections::VecDeque<UploadPending>,
-
-    /// Tenants for which a task in `tasks` has been spawned.
-    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
-
-    tasks: JoinSet<()>,
-
-    /// Channel for our child tasks to send results to: we use a channel for results rather than
-    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
-    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
-    /// behavior.
-    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
-    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
-
-    concurrent_uploads: usize,
-
-    scheduling_interval: Duration,
 }

-/// The uploader task runs a loop that periodically wakes up and schedules tasks for
-/// tenants that require an upload, or handles any commands that have been sent into
-/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
-/// spawn.
-///
-/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
-/// all tenants that require an upload, and in between scheduling iterations we will
-/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
-///
-/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
-/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
-/// we might block waiting on a Tenant.
 pub(super) async fn heatmap_uploader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
-    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
+) {
+    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;

-    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-    let mut uploader = HeatmapUploader {
+    let generator = HeatmapUploader {
        tenant_manager,
        remote_storage,
        cancel: cancel.clone(),
-        tasks: JoinSet::new(),
        tenants: HashMap::new(),
-        tenants_pending: std::collections::VecDeque::new(),
-        tenants_uploading: HashMap::new(),
-        task_result_tx: result_tx,
-        task_result_rx: result_rx,
-        concurrent_uploads,
-        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
    };
+    let mut scheduler = Scheduler::new(generator, concurrency);

-    tracing::info!("Waiting for background_jobs_can start...");
-    background_jobs_can_start.wait().await;
-    tracing::info!("background_jobs_can is ready, proceeding.");
-
-    while !cancel.is_cancelled() {
-        // Look for new work: this is relatively expensive because we have to go acquire the lock on
-        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
-        // require an upload.
-        uploader.schedule_iteration().await?;
-
-        // Between scheduling iterations, we will:
-        //  - Drain any complete tasks and spawn pending tasks
-        //  - Handle incoming administrative commands
-        //  - Check our cancellation token
-        let next_scheduling_iteration = Instant::now()
-            .checked_add(uploader.scheduling_interval)
-            .unwrap_or_else(|| {
-                tracing::warn!(
-                    "Scheduling interval invalid ({}s), running immediately!",
-                    uploader.scheduling_interval.as_secs_f64()
-                );
-                Instant::now()
-            });
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
-                    tracing::info!("Heatmap uploader joining tasks");
-                    while let Some(_r) = uploader.tasks.join_next().await {};
-                    tracing::info!("Heatmap uploader terminating");
-
-                    break;
-                },
-                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
-                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
-                    break;},
-                cmd = command_queue.recv() => {
-                    tracing::debug!("heatmap_uploader_task: woke for command queue");
-                    let cmd = match cmd {
-                        Some(c) =>c,
-                        None => {
-                            // SecondaryController was destroyed, and this has raced with
-                            // our CancellationToken
-                            tracing::info!("Heatmap uploader terminating");
-                            cancel.cancel();
-                            break;
-                        }
-                    };
-
-                    let CommandRequest{
-                        response_tx,
-                        payload
-                    } = cmd;
-                    uploader.handle_command(payload, response_tx);
-                },
-                _ = uploader.process_next_completion() => {
-                    if !cancel.is_cancelled() {
-                        uploader.spawn_pending();
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(())
+    scheduler
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("heatmap_uploader"))
+        .await
 }

-impl HeatmapUploader {
-    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
-    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
+type Scheduler = TenantBackgroundJobs<
+    HeatmapUploader,
+    UploadPending,
+    WriteInProgress,
+    WriteComplete,
+    UploadCommand,
+>;
+
+#[async_trait::async_trait]
+impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
+    for HeatmapUploader
+{
+    async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
        // Cull any entries in self.tenants whose Arc<Tenant> is gone
        self.tenants
            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());

-        // The priority order of previously scheduled work may be invalidated by current state: drop
-        // all pending work (it will be re-scheduled if still needed)
-        self.tenants_pending.clear();
-
-        // Used a fixed 'now' through the following loop, for efficiency and fairness.
        let now = Instant::now();

-        // While iterating over the potentially-long list of tenants, we will periodically yield
-        // to avoid blocking executor.
-        const YIELD_ITERATIONS: usize = 1000;
+        let mut result = SchedulingResult {
+            jobs: Vec::new(),
+            want_interval: None,
+        };

-        // Iterate over tenants looking for work to do.
        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
-        for (i, tenant) in tenants.into_iter().enumerate() {
-            // Process is shutting down, drop out
-            if self.cancel.is_cancelled() {
-                return Ok(());
-            }

-            // Skip tenants that already have a write in flight
-            if self
-                .tenants_uploading
-                .contains_key(tenant.get_tenant_shard_id())
-            {
-                continue;
-            }
+        yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
+            let period = match tenant.get_heatmap_period() {
+                None => {
+                    // Heatmaps are disabled for this tenant
+                    return;
+                }
+                Some(period) => {
+                    // If any tenant has asked for uploads more frequent than our scheduling interval,
+                    // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                    // we may set rather short intervals.
+                    result.want_interval = match result.want_interval {
+                        None => Some(period),
+                        Some(existing) => Some(std::cmp::min(period, existing)),
+                    };

-            self.maybe_schedule_upload(&now, tenant);
+                    period
+                }
+            };

-            if i + 1 % YIELD_ITERATIONS == 0 {
-                tokio::task::yield_now().await;
-            }
-        }
-
-        // Spawn tasks for as many of our pending tenants as we can.
-        self.spawn_pending();
-
-        Ok(())
-    }
-
-    ///
-    /// Cancellation: this method is cancel-safe.
-    async fn process_next_completion(&mut self) {
-        match self.task_result_rx.recv().await {
-            Some(r) => {
-                self.on_completion(r);
-            }
-            None => {
-                unreachable!("Result sender is stored on Self");
-            }
-        }
-    }
-
-    /// The 'maybe' refers to the tenant's state: whether it is configured
-    /// for heatmap uploads at all, and whether sufficient time has passed
-    /// since the last upload.
-    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
-        match tenant.get_heatmap_period() {
-            None => {
-                // Heatmaps are disabled for this tenant
+            // Stale attachments do not upload anything: if we are in this state, there is probably some
+            // other attachment in mode Single or Multi running on another pageserver, and we don't
+            // want to thrash and overwrite their heatmap uploads.
+            if tenant.get_attach_mode() == AttachmentMode::Stale {
                return;
            }
-            Some(period) => {
-                // If any tenant has asked for uploads more frequent than our scheduling interval,
-                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
-                // we may set rather short intervals.
-                if period < self.scheduling_interval {
-                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
-                }
+
+            // Create an entry in self.tenants if one doesn't already exist: this will later be updated
+            // with the completion time in on_completion.
+            let state = self
+                .tenants
+                .entry(*tenant.get_tenant_shard_id())
+                .or_insert_with(|| {
+                    let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
+
+                    UploaderTenantState {
+                        tenant: Arc::downgrade(&tenant),
+                        last_upload: None,
+                        next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
+                        last_digest: None,
+                    }
+                });
+
+            // Decline to do the upload if insufficient time has passed
+            if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
+                return;
            }
-        }

-        // Stale attachments do not upload anything: if we are in this state, there is probably some
-        // other attachment in mode Single or Multi running on another pageserver, and we don't
-        // want to thrash and overwrite their heatmap uploads.
-        if tenant.get_attach_mode() == AttachmentMode::Stale {
-            return;
-        }
-
-        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
-        // with the completion time in on_completion.
-        let state = self
-            .tenants
-            .entry(*tenant.get_tenant_shard_id())
-            .or_insert_with(|| UploaderTenantState {
-                tenant: Arc::downgrade(&tenant),
-                last_upload: None,
-                next_upload: Some(Instant::now()),
-                last_digest: None,
+            let last_digest = state.last_digest;
+            result.jobs.push(UploadPending {
+                tenant,
+                last_digest,
+                target_time: state.next_upload,
+                period: Some(period),
            });
+        })
+        .await
+        .ok();

-        // Decline to do the upload if insufficient time has passed
-        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
-            return;
-        }
+        result
+    }

-        let last_digest = state.last_digest;
-        self.tenants_pending.push_back(UploadPending {
+    fn spawn(
+        &mut self,
+        join_set: &mut JoinSet<()>,
+        result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
+        job: UploadPending,
+    ) -> WriteInProgress {
+        let UploadPending {
            tenant,
            last_digest,
-        })
-    }
+            target_time,
+            period,
+        } = job;

-    fn spawn_pending(&mut self) {
-        while !self.tenants_pending.is_empty()
-            && self.tenants_uploading.len() < self.concurrent_uploads
-        {
-            // unwrap: loop condition includes !is_empty()
-            let pending = self.tenants_pending.pop_front().unwrap();
-            self.spawn_upload(pending.tenant, pending.last_digest);
-        }
-    }
-
-    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
        let remote_storage = self.remote_storage.clone();
-        let tenant_shard_id = *tenant.get_tenant_shard_id();
        let (completion, barrier) = utils::completion::channel();
-        let result_tx = self.task_result_tx.clone();
-        self.tasks.spawn(async move {
+        let tenant_shard_id = *tenant.get_tenant_shard_id();
+        join_set.spawn(async move {
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

@@ -362,6 +271,16 @@ impl HeatmapUploader {
            };

            let now = Instant::now();
+
+            // If the job had a target execution time, we may check our final execution
+            // time against that for observability purposes.
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Elapsed time includes any scheduling lag as well as the execution of the job
+                let elapsed = now.duration_since(target_time);
+
+                warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
+            }
+
            let next_upload = tenant
                .get_heatmap_period()
                .and_then(|period| now.checked_add(period));
@@ -374,10 +293,28 @@ impl HeatmapUploader {
                    next_upload,
                })
                .ok();
-        });
+        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())));
+        WriteInProgress { barrier }
+    }

-        self.tenants_uploading
-            .insert(tenant_shard_id, WriteInProgress { barrier });
+    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
+        let tenant_shard_id = command.get_tenant_shard_id();
+
+        tracing::info!(
+            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+            "Starting heatmap write on command");
+        let tenant = self
+            .tenant_manager
+            .get_attached_tenant_shard(*tenant_shard_id, true)
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        Ok(UploadPending {
+            // Ignore our state for last digest: this forces an upload even if nothing has changed
+            last_digest: None,
+            tenant,
+            target_time: None,
+            period: None,
+        })
    }

    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -389,7 +326,6 @@ impl HeatmapUploader {
            digest,
            next_upload,
        } = completion;
-        self.tenants_uploading.remove(&tenant_shard_id);
        use std::collections::hash_map::Entry;
        match self.tenants.entry(tenant_shard_id) {
            Entry::Vacant(_) => {
@@ -402,69 +338,6 @@ impl HeatmapUploader {
            }
        }
    }
-
-    fn handle_command(
-        &mut self,
-        command: UploadCommand,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        match command {
-            UploadCommand::Upload(tenant_shard_id) => {
-                // If an upload was ongoing for this tenant, let it finish first.
-                let barrier = if let Some(writing_state) =
-                    self.tenants_uploading.get(&tenant_shard_id)
-                {
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap write to complete");
-                    writing_state.barrier.clone()
-                } else {
-                    // Spawn the upload then immediately wait for it.  This will block processing of other commands and
-                    // starting of other background work.
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Starting heatmap write on command");
-                    let tenant = match self
-                        .tenant_manager
-                        .get_attached_tenant_shard(tenant_shard_id, true)
-                    {
-                        Ok(t) => t,
-                        Err(e) => {
-                            // Drop result of send: we don't care if caller dropped their receiver
-                            drop(response_tx.send(CommandResponse {
-                                result: Err(e.into()),
-                            }));
-                            return;
-                        }
-                    };
-                    self.spawn_upload(tenant, None);
-                    let writing_state = self
-                        .tenants_uploading
-                        .get(&tenant_shard_id)
-                        .expect("We just inserted this");
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap upload to complete");
-
-                    writing_state.barrier.clone()
-                };
-
-                // This task does no I/O: it only listens for a barrier's completion and then
-                // sends to the command response channel.  It is therefore safe to spawn this without
-                // any gates/task_mgr hooks.
-                tokio::task::spawn(async move {
-                    barrier.wait().await;
-
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Heatmap upload complete");
-
-                    // Drop result of send: we don't care if caller dropped their receiver
-                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
-                });
-            }
-        }
-    }
 }

 enum UploadHeatmapOutcome {
@@ -487,7 +360,6 @@ enum UploadHeatmapError {

 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
-#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
    tenant: &Arc<Tenant>,
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -0,0 +1,380 @@
+use async_trait;
+use std::{
+    collections::HashMap,
+    marker::PhantomData,
+    time::{Duration, Instant},
+};
+
+use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use utils::completion::Barrier;
+
+use super::{CommandRequest, CommandResponse};
+
+/// Scheduling interval is the time between calls to JobGenerator::schedule.
+/// When we schedule jobs, the job generator may provide a hint of its preferred
+/// interval, which we will respect within these intervals.
+const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
+
+#[derive(thiserror::Error, Debug)]
+pub(super) enum YieldingLoopError {
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
+pub(super) async fn yielding_loop<I, T, F>(
+    interval: usize,
+    cancel: &CancellationToken,
+    iter: I,
+    mut visitor: F,
+) -> Result<(), YieldingLoopError>
+where
+    I: Iterator<Item = T>,
+    F: FnMut(T),
+{
+    for (i, item) in iter.enumerate() {
+        visitor(item);
+
+        if i + 1 % interval == 0 {
+            tokio::task::yield_now().await;
+            if cancel.is_cancelled() {
+                return Err(YieldingLoopError::Cancelled);
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Scheduling helper for background work across many tenants.
+///
+/// PE: a 'PEnding' type for job descriptors that are ready to run
+/// PR: a 'Running' type for jobs that have been spawned
+/// C : a 'Completion' type that spawned jobs will send when they finish
+pub(super) struct TenantBackgroundJobs<G, PE, PR, C, CMD>
+where
+    C: TenantScoped,
+    PE: TenantScoped,
+    PR: HasBarrier,
+    G: JobGenerator<PE, PR, C, CMD>,
+{
+    generator: G,
+
+    /// Ready to run.  Will progress to `running` once concurrent limit is satisfied, or
+    /// be removed on next scheduling pass.
+    pending: std::collections::VecDeque<PE>,
+
+    /// Tasks currently running in Self::tasks for these tenants.  Check this map
+    /// before pushing more work into pending for the same tenant.
+    running: HashMap<TenantShardId, PR>,
+
+    tasks: JoinSet<()>,
+
+    /// Channel for our child tasks to send results to: we use a channel for results rather than
+    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
+    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
+    /// behavior.
+    task_result_tx: tokio::sync::mpsc::UnboundedSender<C>,
+    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<C>,
+
+    concurrency: usize,
+
+    /// How often we would like schedule_interval to be called.
+    pub(super) scheduling_interval: Duration,
+
+    _phantom: PhantomData<(PE, PR, C, CMD)>,
+}
+
+/// For types that logically belong to a particular tenant shard, and can
+/// provide its ID on demand.
+pub(super) trait TenantScoped {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// For types that contain a Barrier that may be waited on
+pub(super) trait HasBarrier {
+    fn get_barrier(&self) -> Barrier;
+}
+
+pub(super) struct SchedulingResult<PE> {
+    pub(super) jobs: Vec<PE>,
+    /// The job generator would like to be called again this soon
+    pub(super) want_interval: Option<Duration>,
+}
+
+#[async_trait::async_trait]
+pub(crate) trait JobGenerator<PE, PR, C, CMD>
+where
+    C: TenantScoped,
+    PE: TenantScoped,
+    PR: HasBarrier,
+{
+    /// Called at each scheduling interval.  Return a list of jobs to run, most urgent first.
+    ///
+    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
+    /// Implementations should take care to yield the executor periodically if running
+    /// very long loops.
+    ///
+    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
+    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
+    /// and re-generated.
+    async fn schedule(&mut self) -> SchedulingResult<PE>;
+
+    /// Called when a pending job is ready to be run.
+    /// //
+    /// The spawn operation _must_ spawn a task.  The task spawned _must_ send
+    /// its result to the provided result channel (including in error cases).
+    /// TODO: refactor so that implemeter can't violate these invariants.
+    fn spawn(
+        &mut self,
+        join_set: &mut JoinSet<()>,
+        result_tx: tokio::sync::mpsc::UnboundedSender<C>,
+        pending_job: PE,
+    ) -> PR;
+
+    /// Called when a job previously spawned with spawn() transmits its completion
+    fn on_completion(&mut self, completion: C);
+
+    /// Called when a command is received.  A job will be spawned immediately if the return
+    /// value is Some, ignoring concurrency limits and the pending queue.
+    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PE>;
+}
+
+impl<G, PE, PR, C, CMD> TenantBackgroundJobs<G, PE, PR, C, CMD>
+where
+    C: TenantScoped,
+    PE: TenantScoped,
+    PR: HasBarrier,
+    G: JobGenerator<PE, PR, C, CMD>,
+{
+    pub(super) fn new(generator: G, concurrency: usize) -> Self {
+        let (task_result_tx, task_result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        Self {
+            generator,
+            pending: std::collections::VecDeque::new(),
+            running: HashMap::new(),
+            tasks: JoinSet::new(),
+            task_result_rx,
+            task_result_tx,
+            concurrency,
+            scheduling_interval: MAX_SCHEDULING_INTERVAL,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub(super) async fn run(
+        &mut self,
+        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
+        background_jobs_can_start: Barrier,
+        cancel: CancellationToken,
+    ) {
+        tracing::info!("Waiting for background_jobs_can start...");
+        background_jobs_can_start.wait().await;
+        tracing::info!("background_jobs_can is ready, proceeding.");
+
+        while !cancel.is_cancelled() {
+            // Look for new work: this is relatively expensive because we have to go acquire the lock on
+            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+            // require an upload.
+            self.schedule_iteration(&cancel).await;
+
+            if cancel.is_cancelled() {
+                return;
+            }
+
+            // Schedule some work, if concurrency limit permits it
+            self.spawn_pending();
+
+            // Between scheduling iterations, we will:
+            //  - Drain any complete tasks and spawn pending tasks
+            //  - Handle incoming administrative commands
+            //  - Check our cancellation token
+            let next_scheduling_iteration = Instant::now()
+                .checked_add(self.scheduling_interval)
+                .unwrap_or_else(|| {
+                    tracing::warn!(
+                        "Scheduling interval invalid ({}s)",
+                        self.scheduling_interval.as_secs_f64()
+                    );
+                    // unwrap(): this constant is small, cannot fail to add to time unless
+                    // we are close to the end of the universe.
+                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
+                });
+            loop {
+                tokio::select! {
+                    _ = cancel.cancelled() => {
+                        tracing::info!("joining tasks");
+                        self.shutdown().await;
+                        tracing::info!("terminating on cancellation token.");
+
+                        break;
+                    },
+                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                        tracing::debug!("woke for scheduling interval");
+                        break;},
+                    cmd = command_queue.recv() => {
+                        tracing::debug!("woke for command queue");
+                        let cmd = match cmd {
+                            Some(c) =>c,
+                            None => {
+                                // SecondaryController was destroyed, and this has raced with
+                                // our CancellationToken
+                                tracing::info!("terminating on command queue destruction");
+                                cancel.cancel();
+                                break;
+                            }
+                        };
+
+                        let CommandRequest{
+                            response_tx,
+                            payload
+                        } = cmd;
+                        self.handle_command(payload, response_tx);
+                    },
+                    _ = async {
+                        let completion = self.process_next_completion().await;
+                        self.generator.on_completion(completion);
+                        if !cancel.is_cancelled() {
+                            self.spawn_pending();
+                        }
+                     } => {}
+                }
+            }
+        }
+    }
+
+    /// For all pending tenants that are elegible for execution, spawn their task.
+    ///
+    /// Caller provides the spawn operation, we track the resulting execution.
+    ///
+    /// The spawn operation _must_ spawn a task.  The task spawned _must_ send
+    /// its result to the provided result channel (including in error cases).
+    /// TODO: refactor so that caller can't violate these invariants.
+    fn spawn_pending(&mut self) {
+        while !self.pending.is_empty() && self.running.len() < self.concurrency {
+            // unwrap: loop condition includes !is_empty()
+            let pending = self.pending.pop_front().unwrap();
+            let tenant_shard_id = *pending.get_tenant_shard_id();
+            let in_progress =
+                self.generator
+                    .spawn(&mut self.tasks, self.task_result_tx.clone(), pending);
+
+            self.running.insert(tenant_shard_id, in_progress);
+        }
+    }
+
+    /// For administrative commands: skip the pending queue, ignore concurrency limits
+    fn spawn_now(&mut self, job: PE) -> &PR {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        let in_progress = self
+            .generator
+            .spawn(&mut self.tasks, self.task_result_tx.clone(), job);
+
+        self.running.insert(tenant_shard_id, in_progress);
+        self.running
+            .get(&tenant_shard_id)
+            .expect("We just inserted this")
+    }
+
+    /// Wait until the next task completes, and handle its completion
+    ///
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) -> C {
+        match self.task_result_rx.recv().await {
+            Some(r) => {
+                self.running.remove(r.get_tenant_shard_id());
+                r
+            }
+            None => {
+                unreachable!("Result sender is stored on Self");
+            }
+        }
+    }
+
+    /// Convert the command into a pending job, spawn it, and when the spawned
+    /// job completes, send the result down `response_tx`.
+    fn handle_command(
+        &mut self,
+        cmd: CMD,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        let job = match self.generator.on_command(cmd) {
+            Ok(j) => j,
+            Err(e) => {
+                response_tx.send(CommandResponse { result: Err(e) }).ok();
+                return;
+            }
+        };
+
+        let tenant_shard_id = job.get_tenant_shard_id();
+        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+            barrier
+        } else {
+            let running = self.spawn_now(job);
+            running.get_barrier().clone()
+        };
+
+        // This task does no I/O: it only listens for a barrier's completion and then
+        // sends to the command response channel.  It is therefore safe to spawn this without
+        // any gates/task_mgr hooks.
+        tokio::task::spawn(async move {
+            barrier.wait().await;
+
+            response_tx.send(CommandResponse { result: Ok(()) }).ok();
+        });
+    }
+
+    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
+        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
+    }
+
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    ///
+    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
+    ///
+    /// This function resets the pending list: it is assumed that the caller may change their mind about
+    /// which tenants need work between calls to schedule_iteration.
+    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
+        let SchedulingResult {
+            jobs,
+            want_interval,
+        } = self.generator.schedule().await;
+
+        // Adjust interval based on feedback from the job generator
+        if let Some(want_interval) = want_interval {
+            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
+            self.scheduling_interval = Duration::from_secs(std::cmp::min(
+                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
+                MAX_SCHEDULING_INTERVAL.as_secs(),
+            ));
+        }
+
+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.pending.clear();
+
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
+            // Skip tenants that already have a write in flight
+            if !self.running.contains_key(job.get_tenant_shard_id()) {
+                self.pending.push_back(job);
+            }
+        })
+        .await
+        .ok();
+    }
+
+    /// It is the callers responsibility to make sure that the tasks they scheduled
+    /// respect an appropriate cancellation token, to shut down promptly.
+    async fn shutdown(&mut self) {
+        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+        while let Some(_r) = self.tasks.join_next().await {}
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -3,19 +3,20 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::ShardIndex;
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
 use std::time::SystemTime;
 use tracing::Instrument;
+use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::sync::heavier_once_cell;

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
-use crate::tenant::{remote_timeline_client::LayerFileMetadata, RemoteTimelineClient, Timeline};
+use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
 use super::image_layer;
@@ -92,7 +93,7 @@ impl Layer {

        let owner = Layer(Arc::new(LayerInner::new(
            conf,
-            timeline,
+            Arc::downgrade(timeline),
            access_stats,
            desc,
            None,
@@ -105,6 +106,34 @@ impl Layer {
        owner
    }

+    /// A layer which is resident locally in a secondary location: not associated with a live Timeline object.
+    pub(crate) fn for_secondary(
+        conf: &'static PageServerConf,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        file_name: LayerFileName,
+        metadata: LayerFileMetadata,
+    ) -> Self {
+        let desc = PersistentLayerDesc::from_filename(
+            *tenant_shard_id,
+            *timeline_id,
+            file_name,
+            metadata.file_size(),
+        );
+
+        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
+
+        Layer(Arc::new(LayerInner::new(
+            conf,
+            Weak::default(),
+            access_stats,
+            desc,
+            None,
+            metadata.generation,
+            metadata.shard,
+        )))
+    }
+
    /// Creates a Layer value for a file we know to be resident in timeline directory.
    pub(crate) fn for_resident(
        conf: &'static PageServerConf,
@@ -133,7 +162,7 @@ impl Layer {

            LayerInner::new(
                conf,
-                timeline,
+                Arc::downgrade(timeline),
                access_stats,
                desc,
                Some(inner),
@@ -177,7 +206,7 @@ impl Layer {
            );
            LayerInner::new(
                conf,
-                timeline,
+                Arc::downgrade(timeline),
                access_stats,
                desc,
                Some(inner),
@@ -204,17 +233,14 @@ impl Layer {
    ///
    /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
    /// of download-evict cycle on retry.
-    pub(crate) async fn evict_and_wait(
-        &self,
-        rtc: &RemoteTimelineClient,
-    ) -> Result<(), EvictionError> {
-        self.0.evict_and_wait(rtc).await
+    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
+        self.0.evict_and_wait().await
    }

    /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
    /// then.
    ///
-    /// On drop, this will cause a call to [`RemoteTimelineClient::schedule_deletion_of_unlinked`].
+    /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
    /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
    /// the value this is called on gets dropped.
    ///
@@ -556,7 +582,7 @@ impl Drop for LayerInner {
 impl LayerInner {
    fn new(
        conf: &'static PageServerConf,
-        timeline: &Arc<Timeline>,
+        timeline: Weak<Timeline>,
        access_stats: LayerAccessStats,
        desc: PersistentLayerDesc,
        downloaded: Option<Arc<DownloadedLayer>>,
@@ -564,7 +590,7 @@ impl LayerInner {
        shard: ShardIndex,
    ) -> Self {
        let path = conf
-            .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
+            .timeline_path(&desc.tenant_shard_id, &desc.timeline_id)
            .join(desc.filename().to_string());

        let (inner, version) = if let Some(inner) = downloaded {
@@ -575,12 +601,17 @@ impl LayerInner {
            (heavier_once_cell::OnceCell::default(), 0)
        };

+        let have_remote_client = timeline
+            .upgrade()
+            .map(|t| t.remote_client.is_some())
+            .unwrap_or(false);
+
        LayerInner {
            conf,
            path,
            desc,
-            timeline: Arc::downgrade(timeline),
-            have_remote_client: timeline.remote_client.is_some(),
+            timeline,
+            have_remote_client,
            access_stats,
            wanted_deleted: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
@@ -606,10 +637,7 @@ impl LayerInner {

    /// Cancellation safe, however dropping the future and calling this method again might result
    /// in a new attempt to evict OR join the previously started attempt.
-    pub(crate) async fn evict_and_wait(
-        &self,
-        _: &RemoteTimelineClient,
-    ) -> Result<(), EvictionError> {
+    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
        use tokio::sync::broadcast::error::RecvError;

        assert!(self.have_remote_client);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind {
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
    InitialLogicalSizeCalculation,
+    HeatmapUpload,
+    SecondaryDownload,
 }

 impl BackgroundLoopKind {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1127,12 +1127,11 @@ impl Timeline {
            return Ok(None);
        };

-        let rtc = self
-            .remote_client
+        self.remote_client
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;

-        match local_layer.evict_and_wait(rtc).await {
+        match local_layer.evict_and_wait().await {
            Ok(()) => Ok(Some(true)),
            Err(EvictionError::NotFound) => Ok(Some(false)),
            Err(EvictionError::Downloaded) => Ok(Some(false)),
@@ -4598,11 +4597,6 @@ mod tests {
            .await
            .unwrap();

-        let rtc = timeline
-            .remote_client
-            .clone()
-            .expect("just configured this");
-
        let layer = find_some_layer(&timeline).await;
        let layer = layer
            .keep_resident()
@@ -4611,8 +4605,8 @@ mod tests {
            .expect("should had been resident")
            .drop_eviction_guard();

-        let first = async { layer.evict_and_wait(&rtc).await };
-        let second = async { layer.evict_and_wait(&rtc).await };
+        let first = async { layer.evict_and_wait().await };
+        let second = async { layer.evict_and_wait().await };

        let (first, second) = tokio::join!(first, second);

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -215,13 +215,10 @@ impl Timeline {

        // So, we just need to deal with this.

-        let remote_client = match self.remote_client.as_ref() {
-            Some(c) => c,
-            None => {
-                error!("no remote storage configured, cannot evict layers");
-                return ControlFlow::Continue(());
-            }
-        };
+        if self.remote_client.is_none() {
+            error!("no remote storage configured, cannot evict layers");
+            return ControlFlow::Continue(());
+        }

        let mut js = tokio::task::JoinSet::new();
        {
@@ -274,9 +271,8 @@ impl Timeline {
                };
                let layer = guard.drop_eviction_guard();
                if no_activity_for > p.threshold {
-                    let remote_client = remote_client.clone();
                    // this could cause a lot of allocations in some cases
-                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
+                    js.spawn(async move { layer.evict_and_wait().await });
                    stats.candidates += 1;
                }
            }
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
        self.verbose_error(res)

+    def tenant_secondary_download(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
+        self.verbose_error(res)
+
    def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
        assert "tenant_id" not in config.keys()
        res = self.put(
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -8,6 +8,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
+    NeonPageserver,
    PgBin,
    wait_for_last_flush_lsn,
 )
@@ -73,14 +74,21 @@ class EvictionEnv:
    layer_size: int
    pgbench_init_lsns: Dict[TenantId, Lsn]

-    def timelines_du(self) -> Tuple[int, int, int]:
+    @property
+    def pageserver(self):
+        """
+        Shortcut for tests that only use one pageserver.
+        """
+        return self.neon_env.pageserver
+
+    def timelines_du(self, pageserver: NeonPageserver) -> Tuple[int, int, int]:
        return poor_mans_du(
-            self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
+            self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], pageserver, verbose=False
        )

-    def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
+    def du_by_timeline(self, pageserver: NeonPageserver) -> Dict[Tuple[TenantId, TimelineId], int]:
        return {
-            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
+            (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], pageserver, verbose=True)[0]
            for tid, tlid in self.timelines
        }

@@ -108,7 +116,7 @@ class EvictionEnv:
                    _avg = cur.fetchone()

    def pageserver_start_with_disk_usage_eviction(
-        self, period, max_usage_pct, min_avail_bytes, mock_behavior
+        self, pageserver: NeonPageserver, period, max_usage_pct, min_avail_bytes, mock_behavior
    ):
        disk_usage_config = {
            "period": period,
@@ -119,7 +127,12 @@ class EvictionEnv:

        enc = toml.TomlEncoder()

-        self.neon_env.pageserver.start(
+        # these can sometimes happen during startup before any tenants have been
+        # loaded, so nothing can be evicted, we just wait for next iteration which
+        # is able to evict.
+        pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
+
+        pageserver.start(
            overrides=(
                "--pageserver-config-override=disk_usage_based_eviction="
                + enc.dump_inline_table(disk_usage_config).replace("\n", " "),
@@ -133,15 +146,10 @@ class EvictionEnv:
        )

        def statvfs_called():
-            assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*")
+            assert pageserver.log_contains(".*running mocked statvfs.*")

        wait_until(10, 1, statvfs_called)

-        # these can sometimes happen during startup before any tenants have been
-        # loaded, so nothing can be evicted, we just wait for next iteration which
-        # is able to evict.
-        self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
-

 def human_bytes(amt: float) -> str:
    suffixes = ["", "Ki", "Mi", "Gi"]
@@ -156,23 +164,28 @@ def human_bytes(amt: float) -> str:
    raise RuntimeError("unreachable")


-@pytest.fixture
-def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
+def _eviction_env(
+    request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int
+) -> EvictionEnv:
    """
    Creates two tenants, one somewhat larger than the other.
    """

    log.info(f"setting up eviction_env for test {request.node.name}")

+    neon_env_builder.num_pageservers = num_pageservers
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    # initial tenant will not be present on this pageserver
    env = neon_env_builder.init_configs()
    env.start()
-    pageserver_http = env.pageserver.http_client()
+
+    # We will create all tenants on the 0th pageserver
+    pageserver_http = env.pageservers[0].http_client()

    # allow because we are invoking this manually; we always warn on executing disk based eviction
-    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
+    for ps in env.pageservers:
+        ps.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")

    # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
    # Large count of layers and small layer size is good for testing because it makes evictions predictable.
@@ -197,7 +210,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev

        with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
            pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id, pageserver_id=1)

        timelines.append((tenant_id, timeline_id))

@@ -233,6 +246,20 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
    return eviction_env


+@pytest.fixture
+def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
+    return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1)
+
+
+@pytest.fixture
+def eviction_env_ha(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
+    """
+    Variant of the eviction environment with two pageservers for testing eviction on
+    HA configurations with a secondary location.
+    """
+    return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=2)
+
+
 def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
    env = eviction_env

@@ -245,10 +272,10 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
    healthy_tenant_id, healthy_timeline_id = env.timelines[1]

    broken_size_pre, _, _ = poor_mans_du(
-        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], env.pageserver, verbose=True
    )
    healthy_size_pre, _, _ = poor_mans_du(
-        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], env.pageserver, verbose=True
    )

    # try to evict everything, then validate that broken tenant wasn't touched
@@ -258,10 +285,10 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
    log.info(f"{response}")

    broken_size_post, _, _ = poor_mans_du(
-        env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
+        env.neon_env, [(broken_tenant_id, broken_timeline_id)], env.pageserver, verbose=True
    )
    healthy_size_post, _, _ = poor_mans_du(
-        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
+        env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], env.pageserver, verbose=True
    )

    assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
@@ -277,14 +304,14 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv)
    env = eviction_env
    pageserver_http = env.pageserver_http

-    (total_on_disk, _, _) = env.timelines_du()
+    (total_on_disk, _, _) = env.timelines_du(env.pageserver)

    target = total_on_disk // 2

    response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

-    (later_total_on_disk, _, _) = env.timelines_du()
+    (later_total_on_disk, _, _) = env.timelines_du(env.pageserver)

    actual_change = total_on_disk - later_total_on_disk

@@ -303,8 +330,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
    env = eviction_env
    ps_http = env.pageserver_http

-    (total_on_disk, _, _) = env.timelines_du()
-    du_by_timeline = env.du_by_timeline()
+    (total_on_disk, _, _) = env.timelines_du(env.pageserver)
+    du_by_timeline = env.du_by_timeline(env.pageserver)
    log.info("du_by_timeline: %s", du_by_timeline)

    assert len(du_by_timeline) == 2, "this test assumes two tenants"
@@ -344,8 +371,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
        GLOBAL_LRU_LOG_LINE,
    ), "this test is pointless if it fell back to global LRU"

-    (later_total_on_disk, _, _) = env.timelines_du()
-    later_du_by_timeline = env.du_by_timeline()
+    (later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
+    later_du_by_timeline = env.du_by_timeline(env.pageserver)
    log.info("later_du_by_timeline: %s", later_du_by_timeline)

    actual_change = total_on_disk - later_total_on_disk
@@ -373,13 +400,13 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
    env = eviction_env
    ps_http = env.pageserver_http

-    (total_on_disk, _, _) = env.timelines_du()
+    (total_on_disk, _, _) = env.timelines_du(env.pageserver)
    target = total_on_disk

    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

-    (later_total_on_disk, _, _) = env.timelines_du()
+    (later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
    actual_change = total_on_disk - later_total_on_disk
    assert 0 <= actual_change, "nothing can load layers during this test"
    assert actual_change >= target, "eviction must always evict more than target"
@@ -399,8 +426,8 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
    env = eviction_env
    ps_http = env.pageserver_http

-    (total_on_disk, _, _) = env.timelines_du()
-    du_by_timeline = env.du_by_timeline()
+    (total_on_disk, _, _) = env.timelines_du(env.pageserver)
+    du_by_timeline = env.du_by_timeline(env.pageserver)

    # pick any tenant
    [warm, cold] = list(du_by_timeline.keys())
@@ -416,12 +443,12 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
    log.info(f"{response}")

-    (later_total_on_disk, _, _) = env.timelines_du()
+    (later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
    actual_change = total_on_disk - later_total_on_disk
    assert 0 <= actual_change, "nothing can load layers during this test"
    assert actual_change >= target, "eviction must always evict more than target"

-    later_du_by_timeline = env.du_by_timeline()
+    later_du_by_timeline = env.du_by_timeline(env.pageserver)
    for tenant, later_tenant_usage in later_du_by_timeline.items():
        assert (
            later_tenant_usage < du_by_timeline[tenant]
@@ -453,7 +480,10 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):


 def poor_mans_du(
-    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
+    env: NeonEnv,
+    timelines: list[Tuple[TenantId, TimelineId]],
+    pageserver: NeonPageserver,
+    verbose: bool = False,
 ) -> Tuple[int, int, int]:
    """
    Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -463,7 +493,7 @@ def poor_mans_du(
    largest_layer = 0
    smallest_layer = None
    for tenant_id, timeline_id in timelines:
-        timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
        assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
        total = 0
        for file in timeline_dir.iterdir():
@@ -494,6 +524,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
    env = eviction_env
    env.neon_env.pageserver.stop()
    env.pageserver_start_with_disk_usage_eviction(
+        env.pageserver,
        period="1s",
        max_usage_pct=90,
        min_avail_bytes=0,
@@ -517,11 +548,12 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
    env.neon_env.pageserver.stop()

    # make it seem like we're at 100% utilization by setting total bytes to the used bytes
-    total_size, _, _ = env.timelines_du()
+    total_size, _, _ = env.timelines_du(env.pageserver)
    blocksize = 512
    total_blocks = (total_size + (blocksize - 1)) // blocksize

    env.pageserver_start_with_disk_usage_eviction(
+        env.pageserver,
        period="1s",
        max_usage_pct=33,
        min_avail_bytes=0,
@@ -540,7 +572,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):

    wait_until(10, 1, relieved_log_message)

-    post_eviction_total_size, _, _ = env.timelines_du()
+    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)

    assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"

@@ -555,13 +587,14 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
    env.neon_env.pageserver.stop()

    # make it seem like we're at 100% utilization by setting total bytes to the used bytes
-    total_size, _, _ = env.timelines_du()
+    total_size, _, _ = env.timelines_du(env.pageserver)
    blocksize = 512
    total_blocks = (total_size + (blocksize - 1)) // blocksize

    min_avail_bytes = total_size // 3

    env.pageserver_start_with_disk_usage_eviction(
+        env.pageserver,
        period="1s",
        max_usage_pct=100,
        min_avail_bytes=min_avail_bytes,
@@ -580,7 +613,66 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):

    wait_until(10, 1, relieved_log_message)

-    post_eviction_total_size, _, _ = env.timelines_du()
+    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+
+    assert (
+        total_size - post_eviction_total_size >= min_avail_bytes
+    ), "we requested at least min_avail_bytes worth of free space"
+
+
+def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
+    env = eviction_env_ha
+
+    tenant_ids = [t[0] for t in env.timelines]
+
+    log.info("Setting up secondary location...")
+    ps_attached = env.neon_env.pageservers[0]
+    ps_secondary = env.neon_env.pageservers[1]
+    for tenant_id in tenant_ids:
+        ps_secondary.tenant_location_configure(
+            tenant_id,
+            {
+                "mode": "Secondary",
+                "secondary_conf": {"warm": True},
+                "tenant_conf": {},
+            },
+        )
+        readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
+        log.info(f"Read back conf: {readback_conf}")
+
+        # Request secondary location to download all layers that the attached location has
+        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+        ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    # Configure the secondary pageserver to have a phony small disk size
+    ps_secondary.stop()
+    total_size, _, _ = env.timelines_du(ps_secondary)
+    blocksize = 512
+    total_blocks = (total_size + (blocksize - 1)) // blocksize
+
+    min_avail_bytes = total_size // 3
+
+    env.pageserver_start_with_disk_usage_eviction(
+        ps_secondary,
+        period="1s",
+        max_usage_pct=100,
+        min_avail_bytes=min_avail_bytes,
+        mock_behavior={
+            "type": "Success",
+            "blocksize": blocksize,
+            "total_blocks": total_blocks,
+            # Only count layer files towards used bytes in the mock_statvfs.
+            # This avoids accounting for metadata files & tenant conf in the tests.
+            "name_filter": ".*__.*",
+        },
+    )
+
+    def relieved_log_message():
+        assert ps_secondary.log_contains(".*disk usage pressure relieved")
+
+    wait_until(10, 1, relieved_log_message)
+
+    post_eviction_total_size, _, _ = env.timelines_du(ps_secondary)

    assert (
        total_size - post_eviction_total_size >= min_avail_bytes
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1,9 +1,11 @@
 import random
+from pathlib import Path
 from typing import Any, Dict, Optional

 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
+from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
+from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
@@ -255,6 +257,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
        flush_ms=5000,
    )

+    # Encourage the new location to download while still in secondary mode
+    pageserver_b.http_client().tenant_secondary_download(tenant_id)
+
    migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
    log.info(f"Acquired generation {migrated_generation} for destination pageserver")
    assert migrated_generation == initial_generation + 1
@@ -262,8 +267,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
    # Writes and reads still work in AttachedStale.
    workload.validate(pageserver_a.id)

-    # TODO: call into secondary mode API hooks to do an upload/download sync
-
    # Generate some more dirty writes: we expect the origin to ingest WAL in
    # in AttachedStale
    workload.churn_rows(64, pageserver_a.id, upload=False)
@@ -373,3 +376,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
    log.info(f"Read back heatmap: {heatmap_second}")
    assert heatmap_second != heatmap_first
    validate_heatmap(heatmap_second)
+
+
+def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    """
+    Inspect local storage on a pageserver to discover which layer files are present.
+
+    :return: list of relative paths to layers, from the timeline root.
+    """
+    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
+
+    def relative(p: Path) -> Path:
+        return p.relative_to(timeline_path)
+
+    return sorted(
+        list(
+            map(
+                relative,
+                filter(
+                    lambda path: path.name != "metadata"
+                    and "ephemeral" not in path.name
+                    and "temp" not in path.name,
+                    timeline_path.glob("*"),
+                ),
+            )
+        )
+    )
+
+
+def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the overall data flow in secondary mode:
+     - Heatmap uploads from the attached location
+     - Heatmap & layer downloads from the secondary location
+     - Eviction of layers on the attached location results in deletion
+       on the secondary location as well.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps_attached = env.pageservers[0]
+    ps_secondary = env.pageservers[1]
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, ps_attached.id)
+
+    # Configure a secondary location
+    log.info("Setting up secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+    readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
+    log.info(f"Read back conf: {readback_conf}")
+
+    # Explicit upload/download cycle
+    # ==============================
+    log.info("Synchronizing after initial write...")
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Make changes on attached pageserver, check secondary downloads them
+    # ===================================================================
+    log.info("Synchronizing after subsequent write...")
+    workload.churn_rows(128, ps_attached.id)
+
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
+    # walreceiver is still doing something.
+    import time
+
+    time.sleep(5)
+
+    # Do evictions on attached pageserver, check secondary follows along
+    # ==================================================================
+    log.info("Evicting a layer...")
+    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
+    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
+
+    log.info("Synchronizing after eviction...")
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Scrub the remote storage
+    # ========================
+    # This confirms that the scrubber isn't upset by the presence of the heatmap
+    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+
+    # Detach secondary and delete tenant
+    # ===================================
+    # This confirms that the heatmap gets cleaned up as well as other normal content.
+    log.info("Detaching secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+        },
+    )
+
+    log.info("Deleting tenant...")
+    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
+
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
Author	SHA1	Message	Date
John Spray	fab67eed59	Update pageserver/src/tenant/secondary/heatmap_uploader.rs Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-12-20 16:10:59 +00:00
John Spray	a57116e92b	Update pageserver/src/tenant/secondary/downloader.rs Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-12-20 16:09:05 +00:00
John Spray	3cde0e7aad	(downloader.rs): fix a docstring link	2023-12-20 11:02:05 +00:00
John Spray	280c53fa5b	(scheduler) Remove unused trait	2023-12-20 10:39:44 +00:00
John Spray	b7e0b4a866	pageserver: propagate secondary config into downloader	2023-12-20 10:37:03 +00:00
John Spray	c772894c57	pageserver: use warn_when_period_overrun in secondary/heatmap	2023-12-20 10:21:42 +00:00
John Spray	bc1f328d61	tests: add `test_secondary_mode_eviction`	2023-12-20 10:21:31 +00:00
John Spray	1fbc7818a2	tests: add test_secondary_downloads	2023-12-20 10:21:31 +00:00
John Spray	e79b63cbff	tests: do secondary download in live migration test	2023-12-20 10:21:31 +00:00
John Spray	ce5ef5b9bb	Use secondary mode bits in neon_local migration	2023-12-20 10:21:31 +00:00
John Spray	380cadf02b	Use secondary mode bits in neon_local migration	2023-12-20 10:21:31 +00:00
John Spray	807c34b953	pageserver: include secondary tenants in disk usage eviction	2023-12-20 10:21:31 +00:00
John Spray	3f5484f8db	pageserver: pass TenantManager into disk usage eviction task	2023-12-20 10:21:31 +00:00
John Spray	a65707ea92	pageserver: add Layer::for_secondary	2023-12-20 10:21:31 +00:00
John Spray	d7edaea444	pageserver: create timelines/ dir when configuring secondary location	2023-12-20 10:21:31 +00:00
John Spray	201954c12c	pageserver/http: add testing routes for secondary mode	2023-12-20 10:21:31 +00:00
John Spray	69f927a9be	pageserver: TenantManager support for SecondaryTenant	2023-12-20 10:21:31 +00:00
John Spray	e080bc053f	pageserver: add secondary downloader	2023-12-20 10:21:31 +00:00
John Spray	5e38dd2bad	pageserver: add secondary downloader metrics	2023-12-20 10:21:31 +00:00
John Spray	ba1b50efd1	pageserver: refactor generic parts of uploader into scheduler	2023-12-20 10:21:30 +00:00
John Spray	eedf946d90	libs: implement Debug for Gate	2023-12-19 21:36:50 +00:00