pageserver: include secondary tenants in disk usage eviction

pageserver: pass TenantManager into disk usage eviction task
pageserver: add Layer::for_secondary
2026-05-26 09:30:37 +00:00 · 2023-10-26 20:33:47 +01:00 · 2023-10-26 20:33:47 +01:00 · 2023-10-26 20:33:47 +01:00 · 2023-10-26 20:33:47 +01:00 · 2023-10-26 20:33:47 +01:00
79 changed files with 6813 additions and 3491 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,4 +5,6 @@ self-hosted-runner:
    - small
    - us-east-2
 config-variables:
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,6 +203,10 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
+        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
+          exit 0
+        fi
+
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -340,11 +340,11 @@ jobs:

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
-          export REMOTE_STORAGE_AZURE_REGION=eastus2
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

@@ -433,7 +433,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
@@ -468,7 +468,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -847,7 +847,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.1
+      VM_BUILDER_VERSION: v0.18.2

    steps:
      - name: Checkout
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -221,8 +221,21 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
            generation: 0,
        });

-    if attach_req.pageserver_id.is_some() {
+    if let Some(attaching_pageserver) = attach_req.pageserver_id.as_ref() {
        tenant_state.generation += 1;
+        tracing::info!(
+            "attach_hook: issuing generation {} to pageserver {}",
+            attaching_pageserver,
+            tenant_state.generation
+        );
+    } else if let Some(ps_id) = tenant_state.pageserver {
+        tracing::info!(
+            "attach_hook: dropping pageserver {} in generation {}",
+            ps_id,
+            tenant_state.generation
+        );
+    } else {
+        tracing::info!("attach_hook: no-op: tenant already has no pageserver");
    }
    tenant_state.pageserver = attach_req.pageserver_id;
    let generation = tenant_state.generation;
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custo Ord and PartialOrd implementations below with
-// deriving them.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them. This will require changes in walredoproc.c.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
    fn cmp(&self, other: &Self) -> Ordering {
-        let mut cmp = self.spcnode.cmp(&other.spcnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.dbnode.cmp(&other.dbnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.relnode.cmp(&other.relnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.forknum.cmp(&other.forknum);
-
-        cmp
+        // Custom ordering where we put forknum to the end of the list
+        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
    }
 }

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -73,6 +73,8 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+pub mod sync;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1 @@
+pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,306 @@
+use std::sync::{Arc, Mutex, MutexGuard};
+use tokio::sync::Semaphore;
+
+/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// for the duration of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    inner: Mutex<Inner<T>>,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+        }
+    }
+}
+
+/// Semaphore is the current state:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+#[derive(Debug)]
+struct Inner<T> {
+    init_semaphore: Arc<Semaphore>,
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    fn default() -> Self {
+        Self {
+            init_semaphore: Arc::new(Semaphore::new(1)),
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    pub fn new(value: T) -> Self {
+        let sem = Semaphore::new(1);
+        sem.close();
+        Self {
+            inner: Mutex::new(Inner {
+                init_semaphore: Arc::new(sem),
+                value: Some(value),
+            }),
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce() -> Fut,
+        Fut: std::future::Future<Output = Result<T, E>>,
+    {
+        let sem = {
+            let guard = self.inner.lock().unwrap();
+            if guard.value.is_some() {
+                return Ok(Guard(guard));
+            }
+            guard.init_semaphore.clone()
+        };
+
+        let permit = sem.acquire_owned().await;
+        if permit.is_err() {
+            let guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_some(),
+                "semaphore got closed, must be initialized"
+            );
+            return Ok(Guard(guard));
+        } else {
+            // now we try
+            let value = factory().await?;
+
+            let mut guard = self.inner.lock().unwrap();
+            assert!(
+                guard.value.is_none(),
+                "we won permit, must not be initialized"
+            );
+            guard.value = Some(value);
+            guard.init_semaphore.close();
+            Ok(Guard(guard))
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
+        if guard.value.is_some() {
+            Some(Guard(guard))
+        } else {
+            None
+        }
+    }
+}
+
+/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
+/// initialized value.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.0
+            .value
+            .as_mut()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for it's deinitialization.
+    ///
+    /// The permit will be on a semaphore part of the new internal value, and any following
+    /// [`OnceCell::get_or_init`] will wait on it to complete.
+    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
+        let mut swapped = Inner::default();
+        let permit = swapped
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .expect("we just created this");
+        std::mem::swap(&mut *self.0, &mut swapped);
+        swapped
+            .value
+            .map(|v| (v, permit))
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    let won = {
+                        let g = cell
+                            .get_or_init(|| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>(i)
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should had taken at least the time time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        for _ in 0..10 {
+            cell.get_or_init(|| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        let initializer = cell.get_or_init(|| async {
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>("never reached")
+        });
+
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
-use pageserver::tenant::TenantSharedResources;
+use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -408,7 +408,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -418,6 +418,7 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -523,6 +524,18 @@ fn start_pageserver(
        }
    });

+    let secondary_controller = if let Some(remote_storage) = &remote_storage {
+        secondary::spawn_tasks(
+            conf,
+            tenant_manager.clone(),
+            remote_storage.clone(),
+            background_jobs_barrier.clone(),
+            shutdown_pageserver.clone(),
+        )
+    } else {
+        secondary::null_controller()
+    };
+
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
@@ -534,6 +547,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
+            tenant_manager.clone(),
            background_jobs_barrier.clone(),
        )?;
    }
@@ -546,11 +560,13 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
                disk_usage_eviction_state,
                deletion_queue.new_client(),
+                secondary_controller,
            )
            .context("Failed to initialize router state")?,
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -206,7 +206,6 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
-                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/deletion_queue/check.log
+++ b/pageserver/src/deletion_queue/check.log
@@ -0,0 +1,2 @@
+    Checking pageserver v0.1.0 (/home/neon/neon/pageserver)
+    Finished dev [optimized + debuginfo] target(s) in 7.62s
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -48,19 +48,26 @@ use std::{
 };

 use anyhow::Context;
-use camino::Utf8Path;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
-use utils::completion;
-use utils::serde_percent::Percent;
+use utils::{
+    completion,
+    id::{TenantId, TenantTimelineId},
+};
+use utils::{id::TimelineId, serde_percent::Percent};

 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        mgr::TenantManager,
+        secondary::SecondaryTenant,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -83,6 +90,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
+    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -108,8 +116,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
-                .await;
+            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
            Ok(())
        },
    );
@@ -121,8 +128,8 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
-    tenants_dir: &Utf8Path,
+    _storage: &GenericRemoteStorage,
+    tenant_manager: Arc<TenantManager>,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -145,14 +152,9 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, &tenant_manager, &cancel)
+                    .await;

            match res {
                Ok(()) => {}
@@ -183,13 +185,14 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
-    tenants_dir: &Utf8Path,
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
+    let tenants_dir = tenant_manager.get_conf().tenants_path();
+    let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res =
+        disk_usage_eviction_task_iteration_impl(state, usage_pre, tenant_manager, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -199,7 +202,7 @@ async fn disk_usage_eviction_task_iteration(
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
-                    let after = filesystem_level_usage::get(tenants_dir, task_config)
+                    let after = filesystem_level_usage::get(&tenants_dir, task_config)
                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                        .context("get filesystem-level disk usage after evictions")?;

@@ -273,8 +276,8 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -294,7 +297,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates = match collect_eviction_candidates(cancel).await? {
+    let candidates = match collect_eviction_candidates(tenant_manager, cancel).await? {
        EvictionCandidates::Cancelled => {
            return Ok(IterationOutcome::Cancelled);
        }
@@ -330,9 +333,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+
+    // Evictions for attached tenants, batched by timeline
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
+
+    // Evictions for secondary locations, batched by tenant
+    let mut secondary_by_tenant: HashMap<TenantId, Vec<(TimelineId, Layer)>> = HashMap::new();
+
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -349,10 +359,26 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
+        // tasks to evict all seen layers until we have evicted enough
+
+        match candidate.source {
+            EvictionCandidateSource::Attached(timeline) => {
+                let batch = batched.entry(TimelineKey(timeline)).or_default();
+
+                // semaphore will later be used to limit eviction concurrency, and we can express at
+                // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+                // but fail gracefully by not making batches larger.
+                if batch.len() < u32::MAX as usize {
+                    batch.push(candidate.layer);
+                    max_batch_size = max_batch_size.max(batch.len());
+                }
+            }
+            EvictionCandidateSource::Secondary(ttid) => {
+                let batch = secondary_by_tenant.entry(ttid.tenant_id).or_default();
+                batch.push((ttid.timeline_id, candidate.layer));
+            }
+        }
    }

    let usage_planned = match warned {
@@ -367,71 +393,116 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    };
    debug!(?usage_planned, "usage planned");

-    // phase2: evict victims batched by timeline
+    // phase2 (secondary tenants): evict victims batched by tenant
+    for (tenant_id, timeline_layers) in secondary_by_tenant {
+        // Q: Why do we go via TenantManager again rather than just deleting files, or keeping
+        // an Arc ref to the secondary state?
+        // A: It's because a given tenant's local storage **belongs** to whoever is currently
+        // live in the TenantManager.  We must avoid a race where we might plan an eviction
+        // for secondary, and then execute it when the tenant is actually in an attached state.
+        tenant_manager
+            .evict_tenant_layers(&tenant_id, timeline_layers)
+            .instrument(tracing::info_span!("evict_batch", %tenant_id))
+            .await;
+    }
+
+    // phase2 (attached tenants): evict victims batched by timeline
+
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // I dislike naming of `available_permits` but it means current total amount of permits
+        // because permits can be added
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                                warn!(%layer, "failed to evict layer: {detail}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch, &cancel).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
+        // chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -443,10 +514,19 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    }))
 }

+// An eviction candidate might originate from either an attached tenant
+// with a [`Tenant`] and [`Timeline`] object, or from a secondary tenant
+// location.  These differ in how we will execute the eviction.
+#[derive(Clone)]
+enum EvictionCandidateSource {
+    Attached(Arc<Timeline>),
+    Secondary(TenantTimelineId),
+}
+
 #[derive(Clone)]
 struct EvictionCandidate {
-    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    source: EvictionCandidateSource,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

@@ -495,27 +575,18 @@ enum EvictionCandidates {
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
 async fn collect_eviction_candidates(
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
    // get a snapshot of the list of tenants
-    let tenants = tenant::mgr::list_tenants()
-        .await
-        .context("get list of tenants")?;
-
    let mut candidates = Vec::new();

-    for (tenant_id, _state) in &tenants {
+    let tenants = tenant_manager.get_attached_tenants();
+
+    for tenant in tenants {
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
-            Ok(tenant) => tenant,
-            Err(e) => {
-                // this can happen if tenant has lifecycle transition after we fetched it
-                debug!("failed to get tenant: {e:#}");
-                continue;
-            }
-        };

        // collect layers from all timelines in this tenant
        //
@@ -578,7 +649,7 @@ async fn collect_eviction_candidates(
        for (timeline, layer_info) in tenant_candidates.into_iter() {
            let file_size = layer_info.file_size();
            let candidate = EvictionCandidate {
-                timeline,
+                source: EvictionCandidateSource::Attached(timeline),
                last_activity_ts: layer_info.last_activity_ts,
                layer: layer_info.layer,
            };
@@ -592,6 +663,43 @@ async fn collect_eviction_candidates(
        }
    }

+    // FIXME: this is a long loop over all secondary locations.  At the least, respect
+    // cancellation here, but really we need to break up the loop.  We could extract the
+    // Arc<SecondaryTenant>s and iterate over them with some tokio yields in there.  Ideally
+    // though we should just reduce the total amount of work: our eviction goals do not require
+    // listing absolutely every layer in every tenant: we could sample this.
+    tenant_manager.foreach_secondary_tenants(
+        |tenant_id: &TenantId, state: &Arc<SecondaryTenant>| {
+        let mut tenant_candidates = Vec::new();
+        for (timeline_id, layer_info) in state.get_layers_for_eviction() {
+            debug!(tenant_id=%tenant_id, timeline_id=%timeline_id, "timeline resident layers (secondary) count: {}", layer_info.resident_layers.len());
+            tenant_candidates.extend(
+                layer_info.resident_layers
+                    .into_iter()
+                    .map(|layer_infos| (timeline_id, layer_infos)),
+            );
+        }
+
+        tenant_candidates
+            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
+
+        candidates.extend(tenant_candidates.into_iter().map(|(timeline_id, candidate)| {
+            (
+                // Secondary locations' layers are always considered above the min resident size,
+                // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                // the layers have sufficiently old access times.
+                MinResidentSizePartition::Above,
+                    EvictionCandidate {
+                    source: EvictionCandidateSource::Secondary(TenantTimelineId { tenant_id: *tenant_id, timeline_id}),
+                    last_activity_ts: candidate.last_activity_ts,
+                    layer: candidate.layer,
+                }
+            )
+        }));
+
+        },
+    );
+
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
    candidates
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -392,13 +392,19 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
+        - name: version
+          in: query
+          required: false
+          schema:
+            type: integer
+          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
-                type: string
+                $ref: "#/components/schemas/LsnByTimestampResponse"
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -1384,6 +1390,19 @@ components:
          type: string
          format: hex

+    LsnByTimestampResponse:
+      type: object
+      required:
+        - lsn
+        - kind
+      properties:
+        lsn:
+          type: string
+          format: hex
+        kind:
+          type: string
+          enum: [past, present, future, nodata]
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -4,11 +4,12 @@
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
+use std::time::Duration;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
-use hyper::header::CONTENT_TYPE;
+use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -17,6 +18,7 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
+use serde_with::{serde_as, DisplayFromStr};
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -35,8 +37,10 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
+use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::Timeline;
@@ -62,22 +66,27 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
+    secondary_controller: SecondaryController,
 }

 impl State {
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        conf: &'static PageServerConf,
+        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
+        secondary_controller: SecondaryController,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
            .iter()
@@ -85,12 +94,14 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
+            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
            broker_client,
            disk_usage_eviction_state,
            deletion_queue_client,
+            secondary_controller,
        })
    }

@@ -146,28 +157,60 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{tmie}").into())
-            }
-            TenantMapInsertError::TenantAlreadyExists(id, state) => {
-                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
-            }
-            TenantMapInsertError::TenantExistsSecondary(id) => {
-                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
-            }
+            TenantMapInsertError::SlotError(e) => e.into(),
+            TenantMapInsertError::SlotUpsertError(e) => e.into(),
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

+impl From<TenantSlotError> for ApiError {
+    fn from(e: TenantSlotError) -> ApiError {
+        use TenantSlotError::*;
+        match e {
+            NotFound(tenant_id) => {
+                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
+            }
+            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
+            e @ Conflict(_) => ApiError::Conflict(format!("{e}")),
+            InProgress => {
+                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
+            }
+            MapState(e) => e.into(),
+        }
+    }
+}
+
+impl From<TenantSlotUpsertError> for ApiError {
+    fn from(e: TenantSlotUpsertError) -> ApiError {
+        use TenantSlotUpsertError::*;
+        match e {
+            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
+            MapState(e) => e.into(),
+        }
+    }
+}
+
+impl From<TenantMapError> for ApiError {
+    fn from(e: TenantMapError) -> ApiError {
+        use TenantMapError::*;
+        match e {
+            StillInitializing | ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{e}").into())
+            }
+        }
+    }
+}
+
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
+            TenantStateError::SlotError(e) => e.into(),
+            TenantStateError::SlotUpsertError(e) => e.into(),
+            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
        }
    }
 }
@@ -242,6 +285,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            Get(g) => ApiError::from(g),
            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
+            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
+            SlotError(e) => e.into(),
+            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -368,7 +414,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -415,7 +461,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -454,7 +500,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -484,6 +530,8 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let version: Option<u8> = parse_query_param(&request, "version")?;
+
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -495,13 +543,32 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

-    let result = match result {
-        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-        LsnForTimestamp::Future(_lsn) => "future".into(),
-        LsnForTimestamp::Past(_lsn) => "past".into(),
-        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-    };
-    json_response(StatusCode::OK, result)
+    if version.unwrap_or(0) > 1 {
+        #[serde_as]
+        #[derive(serde::Serialize)]
+        struct Result {
+            #[serde_as(as = "DisplayFromStr")]
+            lsn: Lsn,
+            kind: &'static str,
+        }
+        let (lsn, kind) = match result {
+            LsnForTimestamp::Present(lsn) => (lsn, "present"),
+            LsnForTimestamp::Future(lsn) => (lsn, "future"),
+            LsnForTimestamp::Past(lsn) => (lsn, "past"),
+            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+        };
+        json_response(StatusCode::OK, Result { lsn, kind })
+    } else {
+        // FIXME: this is a temporary crutch not to break backwards compatibility
+        // See https://github.com/neondatabase/neon/pull/5608
+        let result = match result {
+            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+            LsnForTimestamp::Future(_lsn) => "future".into(),
+            LsnForTimestamp::Past(_lsn) => "past".into(),
+            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+        };
+        json_response(StatusCode::OK, result)
+    }
 }

 async fn get_timestamp_of_lsn_handler(
@@ -691,7 +758,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false).await?;
+        let tenant = mgr::get_tenant(tenant_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -754,7 +821,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;

    // this can be long operation
    let inputs = tenant
@@ -767,6 +834,10 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
+    let accepts_html = headers
+        .get(header::ACCEPT)
+        .map(|v| v == "text/html")
+        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -774,11 +845,11 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
-        if headers["Accept"] == "text/html" {
+        if accepts_html {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
-    } else if headers["Accept"] == "text/html" {
+    } else if accepts_html {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
@@ -929,7 +1000,7 @@ fn synthetic_size_html_response(
 pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
-        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .header(header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1009,7 +1080,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false).await?;
+    let tenant = mgr::get_tenant(tenant_id, false)?;

    let response = HashMap::from([
        (
@@ -1053,6 +1124,9 @@ async fn put_tenant_location_config_handler(
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
+
+    let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
+
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

@@ -1068,7 +1142,7 @@ async fn put_tenant_location_config_handler(
            .await
        {
            match e {
-                TenantStateError::NotFound(_) => {
+                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
                    // This API is idempotent: a NotFound on a detach is fine.
                }
                _ => return Err(e.into()),
@@ -1080,20 +1154,14 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    mgr::upsert_location(
-        state.conf,
-        tenant_id,
-        location_conf,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .await
-    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-    // principle we might have hit something like concurrent API calls to the same tenant,
-    // which is not a 400 but a 409.
-    .map_err(ApiError::BadRequest)?;
+    state
+        .tenant_manager
+        .upsert_location(tenant_id, location_conf, flush, &ctx)
+        .await
+        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+        // principle we might have hit something like concurrent API calls to the same tenant,
+        // which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1106,7 +1174,6 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
-        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1179,7 +1246,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1204,7 +1271,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1310,7 +1377,7 @@ async fn getpage_at_lsn_handler(
        Result::<_, ApiError>::Ok(
            Response::builder()
                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(header::CONTENT_TYPE, "application/octet-stream")
                .body(hyper::Body::from(page))
                .unwrap(),
        )
@@ -1411,7 +1478,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1474,17 +1541,18 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

-    let state = state.disk_usage_eviction_state.clone();
+    let eviction_state = state.disk_usage_eviction_state.clone();

    let cancel = CancellationToken::new();
    let child_cancel = cancel.clone();
    let _g = cancel.drop_guard();
+    let tenant_manager = state.tenant_manager.clone();

    crate::task_mgr::spawn(
        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
@@ -1495,9 +1563,9 @@ async fn disk_usage_eviction_run(
        false,
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-                &state,
-                &storage,
+                &eviction_state,
                usage,
+                &tenant_manager,
                &child_cancel,
            )
            .await;
@@ -1515,6 +1583,36 @@ async fn disk_usage_eviction_run(
    json_response(StatusCode::OK, response)
 }

+async fn secondary_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    state
+        .secondary_controller
+        .download_tenant(tenant_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn secondary_upload_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    state
+        .secondary_controller
+        .upload_tenant(tenant_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1751,6 +1849,16 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
+        .post("/v1/secondary/:tenant_id/upload", |r| {
+            testing_api_handler("force heatmap upload", r, secondary_upload_handler)
+        })
+        .post("/v1/secondary/:tenant_id/download", |r| {
+            testing_api_handler(
+                "force secondary layer download",
+                r,
+                secondary_download_handler,
+            )
+        })
        .put("/v1/tenant/:tenant_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -149,6 +149,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
 }

+// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
+// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
+// from the name.
+
 pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1388,28 +1388,23 @@ impl TimelineMetrics {
        }
    }

-    pub fn record_new_file_metrics(&self, sz: u64) {
+    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

-    pub fn resident_physical_size_sub(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

-    pub fn resident_physical_size_add(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub fn resident_physical_size_set(&self, sz: u64) {
-        self.resident_physical_size_gauge.set(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
-    }
-
-    pub fn resident_physical_size_get(&self) -> u64 {
+    pub(crate) fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1314,7 +1314,7 @@ async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
    _ctx: &RequestContext, /* require get a context to support cancellation in the future */
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    let tenant = match mgr::get_tenant(tenant_id, false).await {
+    let tenant = match mgr::get_tenant(tenant_id, false) {
        Ok(tenant) => tenant,
        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
        Err(GetTenantError::NotActive(_)) => {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -675,8 +675,9 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-        result.add_key(AUX_FILES_KEY);
-
+        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+            result.add_key(AUX_FILES_KEY);
+        }
        Ok(result.to_keyspace())
    }

@@ -1201,7 +1202,8 @@ impl<'a> DatadirModification<'a> {
        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
            Ok(buf) => AuxFilesDirectory::des(&buf)?,
            Err(e) => {
-                warn!("Failed to get info about AUX files: {}", e);
+                // This is expected: historical databases do not have the key.
+                debug!("Failed to get info about AUX files: {}", e);
                AuxFilesDirectory {
                    files: HashMap::new(),
                }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,6 +257,12 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

+    /// See [`crate::tenant::secondary`].
+    SecondaryDownloads,
+
+    /// See [`crate::tenant::secondary`].
+    SecondaryUploads,
+
    // Initial logical size calculation
    InitialLogicalSizeCalculation,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -63,6 +63,7 @@ use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
+use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT_ACTIVATION;
@@ -130,6 +131,7 @@ pub mod storage_layer;
 pub mod config;
 pub mod delete;
 pub mod mgr;
+pub mod secondary;
 pub mod tasks;
 pub mod upload_queue;

@@ -138,9 +140,7 @@ pub(crate) mod timeline;
 pub mod size;

 pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-pub use timeline::{
-    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
-};
+pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;
@@ -194,6 +194,45 @@ struct TimelinePreload {
    index_part: Result<MaybeDeletedIndexPart, DownloadError>,
 }

+/// To include the HeatmapWriter in tenant shutdown, we provide a hook
+/// for it to publish a barrier when upload is going on.  We will take
+/// this and wait on it during shutdown, ensuring that there is no
+/// upload going on once shutdown() returns.
+pub struct HeatmapHook {
+    // Mutually exclude shutdown and any in-flight uploads
+    //
+    // If this is None, we are shutting down
+    in_progress: Option<Arc<tokio::sync::Mutex<()>>>,
+}
+
+impl Default for HeatmapHook {
+    fn default() -> Self {
+        Self {
+            in_progress: Some(Arc::default()),
+        }
+    }
+}
+
+impl HeatmapHook {
+    pub(crate) fn enter(&self) -> Option<tokio::sync::OwnedMutexGuard<()>> {
+        self.in_progress.as_ref().map(|l| {
+            l.clone()
+                .try_lock_owned()
+                // expect: shutdown cannot have started yet or in_progress would have been None,
+                // so we expect that only one HeatmapWriter may take this lock at once.
+                // Depends on the invariant that HeatmapWriter is the only thing that calls
+                // enter(), and that it will never try and do uploads concurrently for the same
+                // tenant.
+                .expect("Tried to double-lock HeatmapHook")
+        })
+    }
+
+    /// Returns a lock that the caller should wait on before proceeding with shutdown
+    fn shutdown(&mut self) -> Arc<tokio::sync::Mutex<()>> {
+        self.in_progress.take().expect("Called shutdown twice")
+    }
+}
+
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -245,6 +284,16 @@ pub struct Tenant {
    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
+
+    pub(crate) heatmap_hook: Mutex<HeatmapHook>,
+
+    pub(crate) cancel: CancellationToken,
+}
+
+impl std::fmt::Debug for Tenant {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.tenant_id, self.current_state())
+    }
 }

 pub(crate) enum WalRedoManager {
@@ -525,7 +574,7 @@ impl Tenant {
        tenant_id: TenantId,
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        expect_marker: AttachMarkerMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -892,7 +941,7 @@ impl Tenant {
        attached_conf: AttachedTenantConf,
        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();
@@ -1057,8 +1106,8 @@ impl Tenant {
                    TimelineId::try_from(timeline_uninit_mark_file.file_stem())
                        .with_context(|| {
                            format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
-                        )
+                                "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
+                            )
                        })?;
                let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
                if let Err(e) =
@@ -1824,6 +1873,11 @@ impl Tenant {
        pitr: Duration,
        ctx: &RequestContext,
    ) -> anyhow::Result<GcResult> {
+        // Don't start doing work during shutdown
+        if let TenantState::Stopping { .. } = self.current_state() {
+            return Ok(GcResult::default());
+        }
+
        // there is a global allowed_error for this
        anyhow::ensure!(
            self.is_active(),
@@ -1852,6 +1906,12 @@ impl Tenant {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        // Don't start doing work during shutdown
+        if let TenantState::Stopping { .. } = self.current_state() {
+            return Ok(());
+        }
+
+        // We should only be called once the tenant has activated.
        anyhow::ensure!(
            self.is_active(),
            "Cannot run compaction iteration on inactive tenant"
@@ -2014,6 +2074,8 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

+        tracing::debug!("shutting down...");
+        self.cancel.cancel();
        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
@@ -2021,6 +2083,7 @@ impl Tenant {
            }
            Err(SetStoppingError::AlreadyStopping(other)) => {
                // give caller the option to wait for this this shutdown
+                info!("Tenant::shutdown: AlreadyStopping");
                return Err(other);
            }
        };
@@ -2034,6 +2097,7 @@ impl Tenant {
                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
            })
        };
+        tracing::debug!("shutdown waiting for timelines...");
        while let Some(res) = js.join_next().await {
            match res {
                Ok(()) => {}
@@ -2043,12 +2107,23 @@ impl Tenant {
            }
        }

+        let heatmap_hook_lock = {
+            let mut hook = self.heatmap_hook.lock().unwrap();
+            hook.shutdown()
+        };
+        tracing::debug!("shutdown waiting heatmap uploads...");
+        // Take & drop lock to ensure any heatmap upload is complete.
+        drop(heatmap_hook_lock.lock().await);
+
+        tracing::debug!("shutdown waiting for tasks...");
        // shutdown all tenant and timeline tasks: gc, compaction, page service
        // No new tasks will be started for this tenant because it's in `Stopping` state.
        //
        // this will additionally shutdown and await all timeline tasks.
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;

+        tracing::debug!("shutdown complete");
+
        Ok(())
    }

@@ -2298,6 +2373,9 @@ where
 }

 impl Tenant {
+    pub fn get_tenant_id(&self) -> TenantId {
+        self.tenant_id
+    }
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.read().unwrap().tenant_conf
    }
@@ -2544,6 +2622,8 @@ impl Tenant {
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
+            heatmap_hook: Mutex::default(),
+            cancel: CancellationToken::new(),
        }
    }

@@ -3353,6 +3433,30 @@ impl Tenant {
    pub fn cached_synthetic_size(&self) -> u64 {
        self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
    }
+
+    /// Flush any in-progress layers, schedule uploads, and wait for uploads to complete.
+    ///
+    /// This function can take a long time: callers should wrap it in a timeout if calling
+    /// from an external API handler.
+    pub async fn flush_remote(&self) -> anyhow::Result<()> {
+        let timelines = self.timelines.lock().unwrap().clone();
+
+        for (timeline_id, timeline) in timelines {
+            tracing::info!(%timeline_id, "Flushing...");
+            timeline.freeze_and_flush().await?;
+            tracing::info!(%timeline_id, "Waiting for uploads...");
+            if let Some(client) = &timeline.remote_client {
+                client.wait_completion().await?;
+            }
+        }
+
+        match self.deletion_queue_client.flush_execute().await {
+            Ok(_) => {}
+            Err(DeletionQueueError::ShuttingDown) => {}
+        }
+
+        Ok(())
+    }
 }

 fn remove_timeline_and_uninit_mark(
@@ -4328,6 +4432,7 @@ mod tests {

    #[tokio::test]
    async fn delta_layer_dumping() -> anyhow::Result<()> {
+        use storage_layer::AsLayerDesc;
        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -4335,16 +4440,18 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        let layer_map = tline.layers.read().await;
-        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
+        let level0_deltas = layer_map
+            .layer_map()
+            .get_level0_deltas()?
+            .into_iter()
+            .map(|desc| layer_map.get_from_desc(&desc))
+            .collect::<Vec<_>>();

        assert!(!level0_deltas.is_empty());

        for delta in level0_deltas {
-            let delta = layer_map.get_from_desc(&delta);
            // Ensure we are dumping a delta layer here
-            let delta = delta.downcast_delta_layer().unwrap();
-
-            delta.dump(false, &ctx).await.unwrap();
+            assert!(delta.layer_desc().is_delta);
            delta.dump(true, &ctx).await.unwrap();
        }

@@ -4379,7 +4486,7 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness.try_load(&ctx).await.err().expect("should fail");
+        let err = harness.try_load(&ctx).await.expect_err("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -21,7 +21,7 @@ use crate::{
 };

 use super::{
-    mgr::{GetTenantError, TenantsMap},
+    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
@@ -35,12 +35,21 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

+    #[error("Tenant not attached")]
+    NotAttached,
+
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

    #[error("Tenant deletion is already in progress")]
    AlreadyInProgress,

+    #[error("Tenant map slot error {0}")]
+    SlotError(#[from] TenantSlotError),
+
+    #[error("Tenant map slot upsert error {0}")]
+    SlotUpsertError(#[from] TenantSlotUpsertError),
+
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

@@ -301,12 +310,12 @@ impl DeleteTenantFlow {
    pub(crate) async fn run(
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

-        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
+        let mut guard = Self::prepare(&tenant).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
@@ -411,7 +420,7 @@ impl DeleteTenantFlow {
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        init_order: Option<&InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -448,7 +457,7 @@ impl DeleteTenantFlow {
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -474,15 +483,8 @@ impl DeleteTenantFlow {
    }

    async fn prepare(
-        tenants: &tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
-    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
-        let m = tenants.read().await;
-
-        let tenant = m
-            .get(&tenant_id)
-            .ok_or(GetTenantError::NotFound(tenant_id))?;
-
+        tenant: &Arc<Tenant>,
+    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
@@ -516,14 +518,14 @@ impl DeleteTenantFlow {
            )));
        }

-        Ok((Arc::clone(tenant), guard))
+        Ok(guard)
    }

    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
        let tenant_id = tenant.tenant_id;
@@ -556,7 +558,7 @@ impl DeleteTenantFlow {
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -604,7 +606,7 @@ impl DeleteTenantFlow {
            .await
            .context("cleanup_remaining_fs_traces")?;

-        let mut locked = tenants.write().await;
+        let mut locked = tenants.write().unwrap();
        if locked.remove(&tenant.tenant_id).is_none() {
            warn!("Tenant got removed from tenants map during deletion");
        };
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -639,147 +639,10 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for layer in self.iter_historic_layers() {
-            layer.dump(verbose, ctx)?;
+        for desc in self.iter_historic_layers() {
+            desc.dump();
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
-    use std::str::FromStr;
-    use std::sync::Arc;
-
-    mod l0_delta_layers_updated {
-
-        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
-        };
-
-        use super::*;
-
-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
-        #[test]
-        fn for_full_range_delta() {
-            // l0_delta_layers are used by compaction, and should observe all buffered updates
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
-        }
-
-        #[test]
-        fn for_non_full_range_delta() {
-            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
-        }
-
-        #[test]
-        fn for_image() {
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
-        }
-
-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = TestLayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
-        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
-            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
-
-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
-
-            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();
-
-            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
-            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
-
-            let expected_in_counts = (1, usize::from(expected_l0));
-
-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
-
-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
-            map.batch_update().remove_historic(downloaded.layer_desc());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
-        }
-
-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
-            let historic = map
-                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
-                .count();
-            let l0s = map
-                .get_level0_deltas()
-                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
-
-            (historic, l0)
-        }
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -57,8 +57,7 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    fsync_in_thread_pool(paths)
 }

-/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
-/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
+/// Parallel fsync asynchronously.
 pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -167,8 +167,6 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -204,15 +202,14 @@
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

-mod download;
+pub(crate) mod download;
 pub mod index;
 mod upload;

 use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
-// re-export these
-pub use download::{is_temp_download_file, list_remote_timelines};
+
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use utils::backoff::{
@@ -237,7 +234,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
@@ -255,10 +252,13 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::LayerFileName;
+use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

+pub(crate) use download::{is_temp_download_file, list_remote_timelines};
+pub(crate) use index::LayerFileMetadata;
+
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -627,101 +627,181 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub fn schedule_layer_file_upload(
+    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        upload_queue
-            .latest_files
-            .insert(layer_file_name.clone(), layer_metadata.clone());
-        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-
-        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
-        self.calls_unfinished_metric_begin(&op);
-        upload_queue.queued_operations.push_back(op);
-
-        info!("scheduled layer file upload {layer_file_name}");
-
-        // Launch the task immediately, if possible
+        self.schedule_layer_file_upload0(upload_queue, layer);
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }

+    fn schedule_layer_file_upload0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        layer: ResidentLayer,
+    ) {
+        let metadata = layer.metadata();
+
+        upload_queue
+            .latest_files
+            .insert(layer.layer_desc().filename(), metadata.clone());
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+
+        info!("scheduled layer file upload {layer}");
+        let op = UploadOp::UploadLayer(layer, metadata);
+        self.calls_unfinished_metric_begin(&op);
+        upload_queue.queued_operations.push_back(op);
+    }
+
    /// Launch a delete operation in the background.
    ///
-    /// The operation does not modify local state but assumes the local files have already been
-    /// deleted, and is used to mirror those changes to remote.
+    /// The operation does not modify local filesystem state.
    ///
    /// Note: This schedules an index file upload before the deletions.  The
-    /// deletion won't actually be performed, until any previously scheduled
+    /// deletion won't actually be performed, until all previously scheduled
    /// upload operations, and the index file upload, have completed
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: Vec<LayerFileName>,
+        names: &[LayerFileName],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        let with_generations =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
+
+        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+
+        // Launch the tasks immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
+    }
+
+    /// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
+    /// layer files, leaving them dangling.
+    ///
+    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
+    /// is invoked on them.
+    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        // just forget the return value; after uploading the next index_part.json, we can consider
+        // the layer files as "dangling". this is fine, at worst case we create work for the
+        // scrubber.
+
+        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
+
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
+
+        self.launch_queued_tasks(upload_queue);
+
+        Ok(())
+    }
+
+    /// Update the remote index file, removing the to-be-deleted files from the index,
+    /// allowing scheduling of actual deletions later.
+    fn schedule_unlinking_of_layers_from_index_part0<I>(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        names: I,
+    ) -> Vec<(LayerFileName, Generation)>
+    where
+        I: IntoIterator<Item = LayerFileName>,
+    {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

-        // Update the remote index file, removing the to-be-deleted files from the index,
-        // before deleting the actual files.
-        //
-        // Once we start removing files from upload_queue.latest_files, there's
-        // no going back! Otherwise, some of the files would already be removed
-        // from latest_files, but not yet scheduled for deletion. Use a closure
-        // to syntactically forbid ? or bail! calls here.
-        let no_bail_here = || {
-            // Decorate our list of names with each name's generation, dropping
-            // makes that are unexpectedly missing from our metadata.
-            let with_generations: Vec<_> = names
-                .into_iter()
-                .filter_map(|name| {
-                    // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(&name);
+        // Decorate our list of names with each name's generation, dropping
+        // names that are unexpectedly missing from our metadata.
+        let with_generations: Vec<_> = names
+            .into_iter()
+            .filter_map(|name| {
+                let meta = upload_queue.latest_files.remove(&name);

-                    if let Some(meta) = meta {
-                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                        Some((name, meta.generation))
-                    } else {
-                        // This can only happen if we forgot to to schedule the file upload
-                        // before scheduling the delete. Log it because it is a rare/strange
-                        // situation, and in case something is misbehaving, we'd like to know which
-                        // layers experienced this.
-                        info!(
-                            "Deleting layer {name} not found in latest_files list, never uploaded?"
-                        );
-                        None
-                    }
-                })
-                .collect();
+                if let Some(meta) = meta {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                    Some((name, meta.generation))
+                } else {
+                    // This can only happen if we forgot to to schedule the file upload
+                    // before scheduling the delete. Log it because it is a rare/strange
+                    // situation, and in case something is misbehaving, we'd like to know which
+                    // layers experienced this.
+                    info!("Deleting layer {name} not found in latest_files list, never uploaded?");
+                    None
+                }
+            })
+            .collect();

-            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata);
-            }
+        // after unlinking files from the upload_queue.latest_files we must always schedule an
+        // index_part update, because that needs to be uploaded before we can actually delete the
+        // files.
+        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+            self.schedule_index_upload(upload_queue, metadata);
+        }

-            for (name, gen) in &with_generations {
-                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
-            }
+        with_generations
+    }

-            // schedule the actual deletions
-            let op = UploadOp::Delete(Delete {
-                layers: with_generations,
-            });
-            self.calls_unfinished_metric_begin(&op);
-            upload_queue.queued_operations.push_back(op);
+    /// Schedules deletion for layer files which have previously been unlinked from the
+    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
+    pub(crate) fn schedule_deletion_of_unlinked(
+        self: &Arc<Self>,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        self.schedule_deletion_of_unlinked0(upload_queue, layers);
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
+    }
+
+    fn schedule_deletion_of_unlinked0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        with_generations: Vec<(LayerFileName, Generation)>,
+    ) {
+        for (name, gen) in &with_generations {
+            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+        }
+
+        // schedule the actual deletions
+        let op = UploadOp::Delete(Delete {
+            layers: with_generations,
+        });
+        self.calls_unfinished_metric_begin(&op);
+        upload_queue.queued_operations.push_back(op);
+    }
+
+    /// Schedules a compaction update to the remote `index_part.json`.
+    ///
+    /// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
+    pub(crate) fn schedule_compaction_update(
+        self: &Arc<Self>,
+        compacted_from: &[Layer],
+        compacted_to: &[ResidentLayer],
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        for layer in compacted_to {
+            self.schedule_layer_file_upload0(upload_queue, layer.clone());
+        }
+
+        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
+
+        let with_generations =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
+        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+        self.launch_queued_tasks(upload_queue);

-            // Launch the tasks immediately, if possible
-            self.launch_queued_tasks(upload_queue);
-        };
-        no_bail_here();
        Ok(())
    }

@@ -1093,16 +1173,12 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(layer_file_name.file_name());
-
+                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        &path,
+                        path,
                        layer_metadata,
                        self.generation,
                    )
@@ -1420,11 +1496,21 @@ impl RemoteTimelineClient {
        }
    }

-    pub(crate) fn get_layer_metadata(
+    pub(crate) fn get_layers_metadata(
        &self,
-        name: &LayerFileName,
-    ) -> anyhow::Result<Option<LayerFileMetadata>> {
-        self.upload_queue.lock().unwrap().get_layer_metadata(name)
+        layers: Vec<LayerFileName>,
+    ) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
+        let q = self.upload_queue.lock().unwrap();
+        let q = match &*q {
+            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", q.as_str())
+            }
+            UploadQueue::Initialized(inner) => inner,
+        };
+
+        let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
+
+        Ok(decorated.collect())
    }
 }

@@ -1466,6 +1552,13 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

+pub const HEATMAP_BASENAME: &str = "heatmap";
+
+pub fn remote_heatmap_path(tenant_id: &TenantId) -> RemotePath {
+    RemotePath::from_string(&format!("tenants/{tenant_id}/{HEATMAP_BASENAME}-v01"))
+        .expect("Failed to construct path")
+}
+
 /// Given the key of an index, parse out the generation part of the name
 pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
@@ -1513,6 +1606,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
+            storage_layer::Layer,
            Generation, Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -1681,32 +1775,29 @@ mod tests {
        let generation = harness.generation;

        // Create a couple of dummy files,  schedule upload for them
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
-        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
-        let content_1 = dummy_contents("foo");
-        let content_2 = dummy_contents("bar");
-        let content_3 = dummy_contents("baz");

-        for (filename, content) in [
-            (&layer_file_name_1, &content_1),
-            (&layer_file_name_2, &content_2),
-            (&layer_file_name_3, &content_3),
-        ] {
-            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
-        }
+        let layers = [
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
+        ]
+        .into_iter()
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
+
+            Layer::for_resident(
+                harness.conf,
+                &timeline,
+                name,
+                LayerFileMetadata::new(contents.len() as u64, generation),
+            )
+        }).collect::<Vec<_>>();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[0].clone())
            .unwrap();
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[1].clone())
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1760,38 +1851,42 @@ mod tests {
                .collect(),
            &[
                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[2].clone())
            .unwrap();
+
+        // this is no longer consistent with how deletion works with Layer::drop, but in this test
+        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
+        // spawn_blocking started by the drop.
        client
-            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
-            assert!(upload_queue.queued_operations.len() == 2);
-            assert!(upload_queue.inprogress_tasks.len() == 1);
-            assert!(upload_queue.num_inprogress_layer_uploads == 1);
-            assert!(upload_queue.num_inprogress_deletions == 0);
-            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
+            assert_eq!(upload_queue.queued_operations.len(), 2);
+            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
+            assert_eq!(upload_queue.num_inprogress_deletions, 0);
+            assert_eq!(
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
+                0
+            );
        }
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1805,8 +1900,8 @@ mod tests {
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layer_file_name_2.file_name(),
-                &layer_file_name_3.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1835,6 +1930,13 @@ mod tests {
        )
        .unwrap();

+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+        );
+
        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1870,10 +1972,7 @@ mod tests {
        let actual_a = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
-            )
+            .schedule_layer_file_upload(layer_file_1.clone())
            .unwrap();

        let actual_b = get_bytes_started_stopped();
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -60,6 +60,8 @@ pub(super) async fn upload_timeline_layer<'a>(
        bail!("failpoint before-upload-layer")
    });

+    pausable_failpoint!("before-upload-layer-pausable");
+
    let storage_path = remote_path(conf, source_path, generation)?;
    let source_file_res = fs::File::open(&source_path).await;
    let source_file = match source_file_res {
@@ -70,6 +72,8 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
+            //
+            // This is tested against `test_compaction_delete_before_upload`
            info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -0,0 +1,268 @@
+pub mod downloader;
+pub mod heatmap;
+pub mod heatmap_writer;
+
+use std::{sync::Arc, time::SystemTime};
+
+use crate::{
+    config::PageServerConf,
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+};
+
+use self::{
+    downloader::{downloader_task, SecondaryDetail},
+    heatmap_writer::heatmap_writer_task,
+};
+
+use super::{
+    mgr::TenantManager,
+    storage_layer::{AsLayerDesc, Layer},
+    timeline::DiskUsageEvictionInfo,
+};
+
+use remote_storage::GenericRemoteStorage;
+
+use tokio_util::sync::CancellationToken;
+use utils::{
+    completion::Barrier,
+    fs_ext,
+    id::{TenantId, TimelineId},
+};
+
+enum DownloadCommand {
+    Download(TenantId),
+}
+enum UploadCommand {
+    Upload(TenantId),
+}
+
+struct CommandRequest<T> {
+    payload: T,
+    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+}
+
+struct CommandResponse {
+    result: anyhow::Result<()>,
+}
+
+// Whereas [`Tenant`] represents an attached tenant, this type represents the work
+// we do for secondary tenant locations: where we are not serving clients or
+// ingesting WAL, but we are maintaining a warm cache of layer files.
+//
+// This type is all about the _download_ path for secondary mode.  The upload path
+// runs while a regular attached `Tenant` exists.
+//
+// This structure coordinates TenantManager and SecondaryDownloader,
+// so that the downloader can indicate which tenants it is currently
+// operating on, and the manager can indicate when a particular
+// secondary tenant should cancel any work in flight.
+#[derive(Debug)]
+pub(crate) struct SecondaryTenant {
+    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
+    /// any work for this tenant at the next opportunity.
+    pub(crate) cancel: CancellationToken,
+
+    /// Lock must be held by SecondaryDownloader at any time that it might be operating
+    /// on the local filesystem directory for this tenant ID.
+    // Ordering: the TenantManager must set the cancellation token _before_
+    // taking the lock.  The SecondaryDownloader must always check the cancellation
+    // token immediately _after_ taking the lock (and at appropriate intervals
+    // while holding it).
+    pub(crate) busy: Arc<tokio::sync::Mutex<()>>,
+
+    detail: std::sync::Mutex<SecondaryDetail>,
+    // TODO: propagate the `warm` from LocationConf into here, and respect it when doing downloads
+}
+
+impl SecondaryTenant {
+    pub(crate) fn new() -> Arc<Self> {
+        // TODO; consider whether we really need to Arc this
+        Arc::new(Self {
+            busy: Arc::new(tokio::sync::Mutex::new(())),
+            // todo: shall we make this a descendent of the
+            // main cancellation token, or is it sufficient that
+            // on shutdown we walk the tenants and fire their
+            // individual cancellations?
+            cancel: CancellationToken::new(),
+
+            detail: std::sync::Mutex::default(),
+        })
+    }
+
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        // Wait for any secondary downloader work to complete: once we
+        // acquire this lock, we are guaranteed that the secondary downloader
+        // won't touch the local filesystem again for this instance: it is safe
+        // to e.g. construct a `Tenant` for the same TenantId
+        drop(self.busy.lock().await);
+    }
+
+    pub(crate) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
+        self.detail.lock().unwrap().get_layers_for_eviction()
+    }
+
+    pub(crate) async fn evict_layers(
+        &self,
+        _guard: tokio::sync::OwnedMutexGuard<()>,
+        conf: &PageServerConf,
+        tenant_id: &TenantId,
+        layers: Vec<(TimelineId, Layer)>,
+    ) {
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
+        if self.cancel.is_cancelled() {
+            // Eviction is a no-op if shutdown() was already called.
+            tracing::info!(
+                "Dropping {} layer evictions, secondary tenant shutting down",
+                layers.len()
+            );
+            return;
+        }
+
+        let now = SystemTime::now();
+
+        for (timeline_id, layer) in layers {
+            let layer_name = layer.layer_desc().filename();
+            let path = conf
+                .timeline_path(tenant_id, &timeline_id)
+                .join(&layer_name.file_name());
+
+            // We tolerate ENOENT, because between planning eviction and executing
+            // it, the secondary downloader could have seen an updated heatmap that
+            // resulted in a layer being deleted.
+            tokio::fs::remove_file(path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+                .expect("TODO: terminate process on local I/O errors");
+
+            // TODO: batch up updates instead of acquiring lock in inner loop
+            let mut detail = self.detail.lock().unwrap();
+            // If there is no timeline detail for what we just deleted, that indicates that
+            // the secondary downloader did some work (perhaps removing all)
+            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
+                timeline_detail.on_disk_layers.remove(&layer_name);
+                timeline_detail.evicted_at.insert(layer_name, now);
+            }
+        }
+    }
+}
+
+/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
+/// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
+/// where we want to immediately upload/download for a particular tenant.  In normal operation
+/// uploads & downloads are autonomous and not driven by this interface.
+pub struct SecondaryController {
+    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+
+    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
+}
+
+impl SecondaryController {
+    async fn dispatch<T>(
+        &self,
+        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
+        payload: T,
+    ) -> anyhow::Result<()> {
+        let (response_tx, response_rx) = tokio::sync::oneshot::channel();
+
+        queue
+            .send(CommandRequest {
+                payload,
+                response_tx,
+            })
+            .await
+            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
+
+        let response = response_rx
+            .await
+            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
+
+        response.result
+    }
+
+    pub async fn download_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
+        self.dispatch(&self.download_req_tx, DownloadCommand::Download(tenant_id))
+            .await
+    }
+
+    pub async fn upload_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
+        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_id))
+            .await
+    }
+}
+
+pub fn spawn_tasks(
+    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> SecondaryController {
+    let mgr_clone = tenant_manager.clone();
+    let storage_clone = remote_storage.clone();
+    let cancel_clone = cancel.clone();
+    let bg_jobs_clone = background_jobs_can_start.clone();
+
+    let (download_req_tx, download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
+    let (upload_req_tx, upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryDownloads,
+        None,
+        None,
+        "secondary tenant downloads",
+        false,
+        async move {
+            downloader_task(
+                conf,
+                mgr_clone,
+                storage_clone,
+                download_req_rx,
+                bg_jobs_clone,
+                cancel_clone,
+            )
+            .await
+        },
+    );
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryDownloads,
+        None,
+        None,
+        "heatmap uploads",
+        false,
+        async move {
+            heatmap_writer_task(
+                tenant_manager,
+                remote_storage,
+                upload_req_rx,
+                background_jobs_can_start,
+                cancel,
+            )
+            .await
+        },
+    );
+
+    SecondaryController {
+        download_req_tx,
+        upload_req_tx,
+    }
+}
+
+/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
+pub fn null_controller() -> SecondaryController {
+    let (download_req_tx, _download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
+    let (upload_req_tx, _upload_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
+    SecondaryController {
+        upload_req_tx,
+        download_req_tx,
+    }
+}
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -0,0 +1,579 @@
+use std::{
+    collections::{HashMap, HashSet},
+    str::FromStr,
+    sync::Arc,
+    time::{Duration, Instant, SystemTime},
+};
+
+use crate::{
+    config::PageServerConf,
+    tenant::{
+        remote_timeline_client::index::LayerFileMetadata,
+        secondary::CommandResponse,
+        storage_layer::{Layer, LayerFileName},
+        timeline::{DiskUsageEvictionInfo, LocalLayerInfoForDiskUsageEviction},
+    },
+    METADATA_FILE_NAME,
+};
+
+use super::SecondaryTenant;
+use crate::tenant::{
+    mgr::TenantManager,
+    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
+};
+use anyhow::Context;
+
+use chrono::format::{DelayedFormat, StrftimeItems};
+use remote_storage::GenericRemoteStorage;
+
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+use utils::{
+    completion::Barrier,
+    fs_ext,
+    id::{TenantId, TimelineId},
+};
+
+use super::{
+    heatmap::{HeatMapTenant, HeatMapTimeline},
+    CommandRequest, DownloadCommand,
+};
+
+/// Interval between checking if any Secondary tenants have download work to do:
+/// note that this is _not_ the frequency with which we actually freshen the tenants,
+/// just the frequency with which we wake up to decide whether anyone needs freshening.
+///
+/// Making this somewhat infrequent reduces the load on mutexes inside TenantManager
+/// and SecondaryTenant for reads when checking for work to do.
+const DOWNLOAD_CHECK_INTERVAL: Duration = Duration::from_millis(10000);
+
+/// For each tenant, how long must have passed since the last freshen_tenant call before
+/// calling it again.  This is approximately the time by which local data is allowed
+/// to fall behind remote data.
+///
+/// TODO: this should be an upper bound, and tenants that are uploading regularly
+/// should adaptively freshen more often (e.g. a tenant writing 1 layer per second
+/// should not wait a minute between freshens)
+const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+
+#[derive(Debug, Clone)]
+pub(super) struct OnDiskState {
+    layer: Layer,
+    access_time: SystemTime,
+}
+
+impl OnDiskState {
+    fn new(
+        conf: &'static PageServerConf,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        name: LayerFileName,
+        metadata: LayerFileMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            layer: Layer::for_secondary(conf, tenant_id, timeline_id, name, metadata),
+            access_time,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+pub(super) struct SecondaryDetailTimeline {
+    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
+
+    /// We remember when layers were evicted, to prevent re-downloading them.
+    /// TODO: persist this, so that we don't try and re-download everything on restart.
+    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
+}
+
+/// This state is written by the secondary downloader, it is opaque
+/// to TenantManager
+#[derive(Default, Debug)]
+pub(super) struct SecondaryDetail {
+    freshened_at: Option<Instant>,
+    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+}
+
+/// Helper for logging SystemTime
+fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
+    let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
+    datetime.format("%d/%m/%Y %T")
+}
+
+impl SecondaryDetail {
+    pub(super) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
+        let mut result = Vec::new();
+        for (timeline_id, timeline_detail) in &self.timelines {
+            let layers: Vec<_> = timeline_detail
+                .on_disk_layers
+                .values()
+                .map(|ods| LocalLayerInfoForDiskUsageEviction {
+                    layer: ods.layer.clone(),
+                    last_activity_ts: ods.access_time,
+                })
+                .collect();
+
+            let max_layer_size = layers.iter().map(|l| l.layer.metadata().file_size()).max();
+
+            result.push((
+                *timeline_id,
+                DiskUsageEvictionInfo {
+                    resident_layers: layers,
+                    max_layer_size,
+                },
+            ))
+        }
+
+        result
+    }
+}
+
+/// Keep trying to do downloads until the cancellation token is fired.  Remote storage
+/// errors are handled internally: any error returned by this function is an unexpected
+/// internal error of some kind.
+pub(super) async fn downloader_task(
+    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let downloader = SecondaryDownloader {
+        conf,
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+    };
+
+    tracing::info!("Waiting for background_jobs_can start...");
+    background_jobs_can_start.wait().await;
+    tracing::info!("background_jobs_can is ready, proceeding.");
+
+    while !cancel.is_cancelled() {
+        downloader.iteration().await?;
+
+        tokio::select! {
+            _ = cancel.cancelled() => {
+                tracing::info!("Heatmap writer terminating");
+                break;
+            },
+            _ = tokio::time::sleep(DOWNLOAD_CHECK_INTERVAL) => {},
+            cmd = command_queue.recv() => {
+                let cmd = match cmd {
+                    Some(c) =>c,
+                    None => {
+                        // SecondaryController was destroyed, and this has raced with
+                        // our CancellationToken
+                        tracing::info!("Heatmap writer terminating");
+                        break;
+                    }
+                };
+
+                let CommandRequest{
+                    response_tx,
+                    payload
+                } = cmd;
+                let result = downloader.handle_command(payload).await;
+                if response_tx.send(CommandResponse{result}).is_err() {
+                    // Caller went away, e.g. because an HTTP request timed out
+                    tracing::info!("Dropping response to administrative command")
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+struct SecondaryDownloader {
+    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+}
+
+struct TenantJob {
+    tenant_id: TenantId,
+    secondary_state: Arc<SecondaryTenant>,
+
+    // This mutex guard conveys the right to write to the tenant's local directory: it must
+    // be taken before doing downloads, and TenantManager must ensure it has been released
+    // before it considers shutdown complete for the secondary state -- [`SecondaryDownloader`]
+    // will thereby never be racing with [`Tenant`] for access to local files.
+    _guard: tokio::sync::OwnedMutexGuard<()>,
+}
+
+impl SecondaryDownloader {
+    async fn iteration(&self) -> anyhow::Result<()> {
+        // Step 1: identify some tenants that we may work on
+        let mut candidates: Vec<TenantJob> = Vec::new();
+        self.tenant_manager
+            .foreach_secondary_tenants(|tenant_id, secondary_state| {
+                let guard = match secondary_state.busy.clone().try_lock_owned() {
+                    Ok(guard) => guard,
+                    // If we can't lock, someone is in the process of shutting it down, or we are
+                    // already working on it.  We may ignore it when scanning for new work to do.
+                    Err(_) => return,
+                };
+
+                candidates.push(TenantJob {
+                    tenant_id: *tenant_id,
+                    secondary_state: secondary_state.clone(),
+                    _guard: guard,
+                });
+            });
+
+        // Step 2: prioritized selection of next batch of tenants to freshen
+        let now = Instant::now();
+        let candidates = candidates.into_iter().filter(|c| {
+            let detail = c.secondary_state.detail.lock().unwrap();
+            match detail.freshened_at {
+                None => true, // Not yet freshened, therefore elegible to run
+                Some(t) => {
+                    let since = now.duration_since(t);
+                    since > DOWNLOAD_FRESHEN_INTERVAL
+                }
+            }
+        });
+
+        // TODO: don't just cut down the list, prioritize it to freshen the stalest tenants first
+        // TODO: bounded parallelism
+
+        // Step 3: spawn freshen_tenant tasks
+        for job in candidates {
+            if job.secondary_state.cancel.is_cancelled() {
+                continue;
+            }
+
+            async {
+                if let Err(e) = self.freshen_tenant(&job).await {
+                    tracing::info!("Failed to freshen secondary content: {e:#}")
+                };
+
+                // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
+                // take priority to run again.
+                let mut detail = job.secondary_state.detail.lock().unwrap();
+                detail.freshened_at = Some(Instant::now());
+            }
+            .instrument(tracing::info_span!(
+                "freshen_tenant",
+                tenant_id = %job.tenant_id
+            ))
+            .await;
+        }
+
+        Ok(())
+    }
+
+    async fn handle_command(&self, command: DownloadCommand) -> anyhow::Result<()> {
+        match command {
+            DownloadCommand::Download(req_tenant_id) => {
+                let mut candidates: Vec<TenantJob> = Vec::new();
+                self.tenant_manager
+                    .foreach_secondary_tenants(|tenant_id, secondary_state| {
+                        tracing::info!("foreach_secondary: {tenant_id} ({req_tenant_id})");
+                        if tenant_id == &req_tenant_id {
+                            let guard = match secondary_state.busy.clone().try_lock_owned() {
+                                Ok(guard) => guard,
+                                // If we can't lock, someone is in the process of shutting it down, or we are
+                                // already working on it.  We may ignore it when scanning for new work to do.
+                                Err(_) => return,
+                            };
+
+                            candidates.push(TenantJob {
+                                tenant_id: *tenant_id,
+                                secondary_state: secondary_state.clone(),
+                                _guard: guard,
+                            });
+                        }
+                    });
+
+                let tenant_job = if candidates.len() != 1 {
+                    anyhow::bail!("Tenant not found in secondary mode");
+                } else {
+                    candidates.pop().unwrap()
+                };
+
+                self.freshen_tenant(&tenant_job).await
+            }
+        }
+    }
+
+    async fn download_heatmap(&self, tenant_id: &TenantId) -> anyhow::Result<HeatMapTenant> {
+        // TODO: make download conditional on ETag having changed since last download
+
+        let heatmap_path = remote_heatmap_path(tenant_id);
+        // TODO: wrap this download in a select! that checks self.cancel
+        let mut download = self.remote_storage.download(&heatmap_path).await?;
+        let mut heatmap_bytes = Vec::new();
+        let _size = tokio::io::copy(&mut download.download_stream, &mut heatmap_bytes)
+            .await
+            .with_context(|| format!("download heatmap {heatmap_path:?}"))?;
+
+        Ok(serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?)
+    }
+
+    async fn init_timeline_state(
+        &self,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        heatmap: &HeatMapTimeline,
+    ) -> anyhow::Result<SecondaryDetailTimeline> {
+        let timeline_path = self.conf.timeline_path(tenant_id, timeline_id);
+        let mut detail = SecondaryDetailTimeline::default();
+
+        let mut dir = match tokio::fs::read_dir(&timeline_path).await {
+            Ok(d) => d,
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    tracing::info!("Creating timeline directory {timeline_path}");
+                    tokio::fs::create_dir(&timeline_path).await?;
+
+                    // No entries to report: drop out.
+                    return Ok(detail);
+                } else {
+                    return Err(e.into());
+                }
+            }
+        };
+
+        let heatmap_metadata: HashMap<_, _> = heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+
+        while let Some(dentry) = dir.next_entry().await? {
+            let dentry_file_name = dentry.file_name();
+            let file_name = dentry_file_name.to_string_lossy();
+            let local_meta = dentry.metadata().await?;
+
+            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+            if file_name == METADATA_FILE_NAME {
+                continue;
+            }
+
+            match LayerFileName::from_str(&file_name) {
+                Ok(name) => {
+                    let remote_meta = heatmap_metadata.get(&name);
+                    match remote_meta {
+                        Some(remote_meta) => {
+                            // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
+                            if local_meta.len() != remote_meta.metadata.file_size {
+                                // This should not happen, because we do crashsafe write-then-rename when downloading
+                                // layers, and layers in remote storage are immutable.  Remove the local file because
+                                // we cannot trust it.
+                                tracing::warn!("Removing local layer {name} with unexpected local size {} != {}",
+                                    local_meta.len(), remote_meta.metadata.file_size);
+                            } else {
+                                // We expect the access time to be initialized immediately afterwards, when
+                                // the latest heatmap is applied to the state.
+                                detail.on_disk_layers.insert(
+                                    name.clone(),
+                                    OnDiskState::new(
+                                        self.conf,
+                                        tenant_id,
+                                        timeline_id,
+                                        name,
+                                        LayerFileMetadata::from(&remote_meta.metadata),
+                                        remote_meta.access_time,
+                                    ),
+                                );
+                            }
+                        }
+                        None => {
+                            // FIXME: consider some optimization when transitioning from attached to secondary: maybe
+                            // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
+                            // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
+                            tracing::info!(
+                                "Removing secondary local layer {} because it's absent in heatmap",
+                                name
+                            );
+                            tokio::fs::remove_file(dentry.path()).await?;
+                        }
+                    }
+                }
+                Err(_) => {
+                    // Ignore it.
+                    tracing::warn!("Unexpected file in timeline directory: {file_name}");
+                }
+            }
+        }
+
+        Ok(detail)
+    }
+
+    async fn freshen_timeline(
+        &self,
+        job: &TenantJob,
+        timeline: HeatMapTimeline,
+    ) -> anyhow::Result<()> {
+        let timeline_path = self
+            .conf
+            .timeline_path(&job.tenant_id, &timeline.timeline_id);
+
+        // Accumulate updates to the state
+        let mut touched = Vec::new();
+
+        // Clone a view of what layers already exist on disk
+        let timeline_state = job
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .timelines
+            .get(&timeline.timeline_id)
+            .cloned();
+
+        let timeline_state = match timeline_state {
+            Some(t) => t,
+            None => {
+                // We have no existing state: need to scan local disk for layers first.
+                self.init_timeline_state(&job.tenant_id, &timeline.timeline_id, &timeline)
+                    .await?
+            }
+        };
+
+        let layers_in_heatmap = timeline
+            .layers
+            .iter()
+            .map(|l| &l.name)
+            .collect::<HashSet<_>>();
+        let layers_on_disk = timeline_state
+            .on_disk_layers
+            .iter()
+            .map(|l| l.0)
+            .collect::<HashSet<_>>();
+
+        // Remove on-disk layers that are no longer present in heatmap
+        for layer in layers_on_disk.difference(&layers_in_heatmap) {
+            let local_path = timeline_path.join(layer.to_string());
+            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)?;
+        }
+
+        // Download heatmap layers that are not present on local disk, or update their
+        // access time if they are already present.
+        for layer in timeline.layers {
+            if self.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            // Existing on-disk layers: just update their access time.
+            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
+                if on_disk.layer.metadata() != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
+                    // We already have this layer on disk.  Update its access time.
+                    tracing::trace!(
+                        "Access time updated for layer {}: {} -> {}",
+                        layer.name,
+                        strftime(&on_disk.access_time),
+                        strftime(&layer.access_time)
+                    );
+                    touched.push(layer);
+                }
+                continue;
+            }
+
+            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
+            // recently than it was evicted.
+            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
+                if &layer.access_time > evicted_at {
+                    tracing::info!(
+                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                } else {
+                    tracing::trace!(
+                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                    continue;
+                }
+            }
+
+            match download_layer_file(
+                self.conf,
+                &self.remote_storage,
+                job.tenant_id,
+                timeline.timeline_id,
+                &layer.name,
+                &LayerFileMetadata::from(&layer.metadata),
+            )
+            .await
+            {
+                Ok(downloaded_bytes) => {
+                    if downloaded_bytes != layer.metadata.file_size {
+                        let local_path = timeline_path.join(layer.name.to_string());
+
+                        tracing::error!(
+                            "Downloaded layer {} with unexpected size {} != {}",
+                            layer.name,
+                            downloaded_bytes,
+                            layer.metadata.file_size
+                        );
+
+                        tokio::fs::remove_file(&local_path)
+                            .await
+                            .or_else(fs_ext::ignore_not_found)?;
+                    }
+
+                    touched.push(layer)
+                }
+                Err(e) => {
+                    // No retries here: secondary downloads don't have to succeed: if they fail we just proceed and expect
+                    // that on some future call to freshen the download will work.
+                    // TODO: refine this behavior.
+                    tracing::info!("Failed to download layer {}: {}", layer.name, e);
+                }
+            }
+        }
+
+        // Write updates to state to record layers we just downloaded or touched.
+        {
+            let mut detail = job.secondary_state.detail.lock().unwrap();
+            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
+
+            for t in touched {
+                use std::collections::hash_map::Entry;
+                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
+                    Entry::Occupied(mut v) => {
+                        v.get_mut().access_time = t.access_time;
+                    }
+                    Entry::Vacant(e) => {
+                        e.insert(OnDiskState::new(
+                            self.conf,
+                            &job.tenant_id,
+                            &timeline.timeline_id,
+                            t.name,
+                            LayerFileMetadata::from(&t.metadata),
+                            t.access_time,
+                        ));
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn freshen_tenant(&self, job: &TenantJob) -> anyhow::Result<()> {
+        // Download the tenant's heatmap
+        let heatmap = self.download_heatmap(&job.tenant_id).await?;
+
+        // Download the layers in the heatmap
+        for timeline in heatmap.timelines {
+            if self.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            self.freshen_timeline(job, timeline).await?;
+        }
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -0,0 +1,57 @@
+use std::time::SystemTime;
+
+use crate::tenant::{
+    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
+};
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+
+use utils::id::TimelineId;
+
+#[derive(Serialize, Deserialize)]
+pub(super) struct HeatMapTenant {
+    pub(super) timelines: Vec<HeatMapTimeline>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub(crate) struct HeatMapLayer {
+    pub(super) name: LayerFileName,
+    pub(super) metadata: IndexLayerMetadata,
+
+    pub(super) access_time: SystemTime,
+    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
+    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
+}
+
+impl HeatMapLayer {
+    pub(crate) fn new(
+        name: LayerFileName,
+        metadata: IndexLayerMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            name,
+            metadata,
+            access_time,
+        }
+    }
+}
+
+#[serde_as]
+#[derive(Serialize, Deserialize)]
+pub(crate) struct HeatMapTimeline {
+    #[serde_as(as = "DisplayFromStr")]
+    pub(super) timeline_id: TimelineId,
+
+    pub(super) layers: Vec<HeatMapLayer>,
+}
+
+impl HeatMapTimeline {
+    pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
+        Self {
+            timeline_id,
+            layers,
+        }
+    }
+}
--- a/pageserver/src/tenant/secondary/heatmap_writer.rs
+++ b/pageserver/src/tenant/secondary/heatmap_writer.rs
@@ -0,0 +1,207 @@
+use std::{collections::HashMap, sync::Arc, time::Duration};
+
+use crate::tenant::{
+    mgr::TenantManager, remote_timeline_client::remote_heatmap_path, secondary::CommandResponse,
+    Tenant,
+};
+
+use pageserver_api::models::TenantState;
+use remote_storage::GenericRemoteStorage;
+
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+use utils::{backoff, completion::Barrier};
+
+use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
+
+const HEATMAP_UPLOAD_INTERVAL: Duration = Duration::from_millis(60000);
+
+pub(super) async fn heatmap_writer_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let writer = HeatmapWriter {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+    };
+
+    tracing::info!("Waiting for background_jobs_can start...");
+    background_jobs_can_start.wait().await;
+    tracing::info!("background_jobs_can is ready, proceeding.");
+
+    while !cancel.is_cancelled() {
+        writer.iteration().await?;
+
+        tokio::select! {
+            _ = cancel.cancelled() => {
+                tracing::info!("Heatmap writer terminating");
+                break;
+            },
+            _ = tokio::time::sleep(HEATMAP_UPLOAD_INTERVAL) => {},
+            cmd = command_queue.recv() => {
+                let cmd = match cmd {
+                    Some(c) =>c,
+                    None => {
+                        // SecondaryController was destroyed, and this has raced with
+                        // our CancellationToken
+                        tracing::info!("Heatmap writer terminating");
+                        break;
+                    }
+                };
+
+                let CommandRequest{
+                    response_tx,
+                    payload
+                } = cmd;
+                let result = writer.handle_command(payload).await;
+                if response_tx.send(CommandResponse{result}).is_err() {
+                    // Caller went away, e.g. because an HTTP request timed out
+                    tracing::info!("Dropping response to administrative command")
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+struct HeatmapWriter {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+}
+
+impl HeatmapWriter {
+    async fn iteration(&self) -> anyhow::Result<()> {
+        let tenants = self.tenant_manager.get_attached_tenants();
+
+        for tenant in tenants {
+            if self.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            if tenant.current_state() != TenantState::Active {
+                continue;
+            }
+
+            // TODO: add a mechanism to check whether the active layer set has
+            // changed since our last write
+
+            // TODO: add a minimum time between uploads
+
+            match self
+                .write_tenant(&tenant)
+                .instrument(tracing::info_span!(
+                    "write_tenant",
+                    tenant_id = %tenant.get_tenant_id()
+                ))
+                .await
+            {
+                Ok(()) => {}
+                Err(e) => {
+                    tracing::warn!(
+                        "Failed to upload heatmap for tenant {}: {e:#}",
+                        tenant.get_tenant_id(),
+                    )
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn handle_command(&self, command: UploadCommand) -> anyhow::Result<()> {
+        match command {
+            UploadCommand::Upload(tenant_id) => {
+                let tenants = self.tenant_manager.get_attached_tenants();
+
+                let map = tenants
+                    .iter()
+                    .map(|t| (t.get_tenant_id(), t))
+                    .collect::<HashMap<_, _>>();
+                match map.get(&tenant_id) {
+                    Some(tenant) => self.write_tenant(tenant).await,
+                    None => {
+                        anyhow::bail!("Tenant is not attached");
+                    }
+                }
+            }
+        }
+    }
+
+    async fn write_tenant(&self, tenant: &Arc<Tenant>) -> anyhow::Result<()> {
+        let mut heatmap = HeatMapTenant {
+            timelines: Vec::new(),
+        };
+        let timelines = tenant.timelines.lock().unwrap().clone();
+
+        let tenant_cancel = tenant.cancel.clone();
+
+        // Ensure that Tenant::shutdown waits for any upload in flight
+        let _guard = {
+            let hook = tenant.heatmap_hook.lock().unwrap();
+            match hook.enter() {
+                Some(g) => g,
+                None => {
+                    // Tenant is shutting down
+                    tracing::info!("Skipping, tenant is shutting down");
+                    return Ok(());
+                }
+            }
+        };
+
+        for (timeline_id, timeline) in timelines {
+            let heatmap_timeline = timeline.generate_heatmap().await;
+            match heatmap_timeline {
+                None => {
+                    tracing::debug!(
+                        "Skipping heatmap upload because timeline {timeline_id} is not ready"
+                    );
+                    return Ok(());
+                }
+                Some(heatmap_timeline) => {
+                    heatmap.timelines.push(heatmap_timeline);
+                }
+            }
+        }
+
+        // Serialize the heatmap
+        let bytes = serde_json::to_vec(&heatmap)?;
+        let size = bytes.len();
+
+        let path = remote_heatmap_path(&tenant.get_tenant_id());
+
+        // Write the heatmap.
+        tracing::debug!("Uploading {size} byte heatmap to {path}");
+        if let Err(e) = backoff::retry(
+            || async {
+                let bytes = tokio::io::BufReader::new(std::io::Cursor::new(bytes.clone()));
+                let bytes = Box::new(bytes);
+                self.remote_storage
+                    .upload_storage_object(bytes, size, &path)
+                    .await
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "Uploading heatmap",
+            backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
+        )
+        .await
+        {
+            if tenant_cancel.is_cancelled() {
+                return Ok(());
+            } else {
+                return Err(e);
+            }
+        }
+
+        tracing::info!("Successfully uploading {size} byte heatmap to {path}");
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,26 +4,21 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer;
 mod layer_desc;
-mod remote_layer;

-use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
 use bytes::Bytes;
-use camino::Utf8PathBuf;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -39,7 +34,8 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-pub use remote_layer::RemoteLayer;
+
+pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -74,7 +70,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from Layer::get_page_reconstruct_data
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -179,26 +175,6 @@ impl LayerAccessStats {
        new
    }

-    /// Creates a clone of `self` and records `new_status` in the clone.
-    ///
-    /// The `new_status` is not recorded in `self`.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn clone_for_residence_change(
-        &self,
-        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
-        let clone = {
-            let inner = self.0.lock().unwrap();
-            inner.clone()
-        };
-        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
-        new
-    }
-
    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -321,95 +297,12 @@ impl LayerAccessStats {
    }
 }

-/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
-///
-/// All layers should implement a minimal `std::fmt::Debug` without tenant or
-/// timeline names, because those are known in the context of which the layers
-/// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
-    ///
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// See PageReconstructResult for possible return values. The collected data
-    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call, or a struct with a cached older image of the page if one
-    /// is available. If this returns ValueReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data' to
-    /// collect more data.
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult>;
-}
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

-/// A Layer contains all data in a "rectangle" consisting of a range of keys and
-/// range of LSNs.
-///
-/// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access to the
-/// recent page versions. On-disk layers are stored as files on disk, and are
-/// immutable. This trait presents the common functionality of in-memory and
-/// on-disk layers.
-///
-/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
-/// A delta layer contains all modifications within a range of LSNs and keys.
-/// An image layer is a snapshot of all the data in a key-range, at a single
-/// LSN.
-pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// File name used for this layer, both in the pageserver's local filesystem
-    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName {
-        self.layer_desc().filename()
-    }
-
-    // Path to the layer file in the local filesystem.
-    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<Utf8PathBuf>;
-
-    /// Permanently remove this layer from disk.
-    fn delete_resident_layer_file(&self) -> Result<()>;
-
-    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        None
-    }
-
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        false
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
-
-    fn access_stats(&self) -> &LayerAccessStats;
-}
-
-pub fn downcast_remote_layer(
-    layer: &Arc<dyn PersistentLayer>,
-) -> Option<std::sync::Arc<RemoteLayer>> {
-    if layer.is_remote_layer() {
-        Arc::clone(layer).downcast_remote_layer()
-    } else {
-        None
-    }
-}
-
 pub mod tests {
    use super::*;

@@ -447,19 +340,6 @@ pub mod tests {
    }
 }

-/// Helper enum to hold a PageServerConf, or a path
-///
-/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
-/// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
-/// struct for a file on disk, without having a page server running, so that we have no
-/// config. In that case, we use the Path variant to hold the full path to the file on
-/// disk.
-enum PathOrConf {
-    Path(Utf8PathBuf),
-    Conf(&'static PageServerConf),
-}
-
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,18 +34,17 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{
-    PersistentLayer, ValueReconstructResult, ValueReconstructState,
-};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
@@ -59,10 +58,7 @@ use utils::{
    lsn::Lsn,
 };

-use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
-};
+use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -182,20 +178,12 @@ impl DeltaKey {
    }
 }

-/// DeltaLayer is the in-memory data structure associated with an on-disk delta
-/// file.
-///
-/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
 pub struct DeltaLayer {
-    path_or_conf: PathOrConf,
-
+    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -212,6 +200,8 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

+/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
+/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -221,12 +211,6 @@ pub struct DeltaLayerInner {
    file: FileBlockReader,
 }

-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -236,19 +220,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for DeltaLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -262,40 +233,9 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

-impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
 impl DeltaLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
-        );
+        self.desc.dump();

        if !verbose {
            return Ok(());
@@ -303,119 +243,7 @@ impl DeltaLayer {

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

-        println!(
-            "index_start_blk: {}, root {}",
-            inner.index_start_blk, inner.index_root_blk
-        );
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
-
-        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val, ctx).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    let err: anyhow::Error = err;
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.desc.lsn_range.start);
-
-        ensure!(self.desc.key_range.contains(&key));
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-
-    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
-        Some(self.path())
-    }
-
-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        let access_stats = self.access_stats.as_api_model(reset);
-
-        HistoricLayerInfo::Delta {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            lsn_end: lsn_range.end,
-            remote: false,
-            access_stats,
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        fname: &DeltaFileName,
-    ) -> Utf8PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.clone(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(tenant_id, timeline_id)
-                .join(fname.to_string()),
-        }
+        inner.dump(ctx).await
    }

    fn temp_path_for(
@@ -461,52 +289,21 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;

-        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
+        // not production code
+        let actual_filename = path.file_name().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-
-            let actual_filename = path.file_name().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(Arc::new(loaded))
    }

-    /// Create a DeltaLayer struct representing an existing file on disk.
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &DeltaFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> DeltaLayer {
-        DeltaLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_delta(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn_range.clone(),
-                file_size,
-            ),
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -520,7 +317,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -533,29 +330,9 @@ impl DeltaLayer {
        })
    }

-    fn layer_name(&self) -> DeltaFileName {
-        self.desc.delta_file_name()
-    }
    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> Utf8PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            &self.desc.tenant_id,
-            &self.desc.timeline_id,
-            &self.layer_name(),
-        )
-    }
-    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner, ctx)
-            .await
-            .context("Layer index is corrupted")
+    fn path(&self) -> Utf8PathBuf {
+        self.path.clone()
    }
 }

@@ -660,7 +437,7 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -717,37 +494,21 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = DeltaLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc: PersistentLayerDesc::new_delta(
-                self.tenant_id,
-                self.timeline_id,
-                self.key_start..key_end,
-                self.lsn_range.clone(),
-                metadata.len(),
-            ),
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };
+
+        let desc = PersistentLayerDesc::new_delta(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_start..key_end,
+            self.lsn_range.clone(),
+            metadata.len(),
+        );

        // fsync the file
        file.sync_all().await?;
-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = DeltaLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            &self.tenant_id,
-            &self.timeline_id,
-            &DeltaFileName {
-                key_range: self.key_start..key_end,
-                lsn_range: self.lsn_range,
-            },
-        );
-        std::fs::rename(self.path, &final_path)?;

-        trace!("created delta layer {final_path}");
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+
+        trace!("created delta layer {}", layer.local_path());

        Ok(layer)
    }
@@ -828,8 +589,12 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        self.inner.take().unwrap().finish(key_end).await
+    pub(crate) async fn finish(
+        mut self,
+        key_end: Key,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner.take().unwrap().finish(key_end, timeline).await
    }
 }

@@ -967,15 +732,17 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
-        this: &'a T,
-        ctx: &'b RequestContext,
+    pub(super) async fn load_keys<'a>(
+        &'a self,
+        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
+        let file = &self.file;

-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

@@ -988,7 +755,7 @@ impl DeltaLayerInner {
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(dl),
+                            Adapter(self),
                        )),
                    };
                    let pos = BlobRef(value).pos();
@@ -1015,10 +782,61 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
+
+    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        println!(
+            "index_start_blk: {}, root {}",
+            self.index_start_blk, self.index_root_blk
+        );
+
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = self.load_keys(ctx).await?;
+
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val, ctx).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
+    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1058,3 +876,9 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
+
+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,21 +31,23 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
+use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -56,7 +58,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -114,22 +116,14 @@ impl Summary {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
-///
-/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
 pub struct ImageLayer {
-    path_or_conf: PathOrConf,
-
+    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<ImageLayerInner>,
 }

@@ -146,6 +140,8 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

+/// ImageLayer is the in-memory data structure associated with an on-disk image
+/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -166,73 +162,11 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for ImageLayer {
-    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for ImageLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for ImageLayer {
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
-impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental(),
-            self.desc.file_size
-        );
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-        let file = &inner.file;
+impl ImageLayerInner {
+    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        let file = &self.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);

        tree_reader.dump().await?;

@@ -250,69 +184,36 @@ impl ImageLayer {

        Ok(())
    }
+}

-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.desc.key_range.contains(&key));
-        assert!(lsn_range.start >= self.lsn);
-        assert!(lsn_range.end >= self.lsn);
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, reconstruct_state, ctx)
-            .await
-            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path()))
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for ImageLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
    }
+}

-    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
-        Some(self.path())
+impl AsLayerDesc for ImageLayer {
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }
+}
+
+impl ImageLayer {
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        self.desc.dump();
+
+        if !verbose {
+            return Ok(());
+        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+
+        inner.dump(ctx).await?;

-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
        Ok(())
    }

-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_start = self.layer_desc().image_layer_lsn();
-
-        HistoricLayerInfo::Image {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start,
-            remote: false,
-            access_stats: self.access_stats.as_api_model(reset),
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        fname: &ImageFileName,
-    ) -> Utf8PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.to_path_buf(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(&tenant_id, &timeline_id)
-                .join(fname.to_string()),
-        }
-    }
-
    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
@@ -348,54 +249,21 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let expected_summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;

-        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
-                .await?;
+        // not production code
+        let actual_filename = path.file_name().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-            let actual_filename = path.file_name().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(loaded)
    }

-    /// Create an ImageLayer struct representing an existing file on disk
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &ImageFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> ImageLayer {
-        ImageLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_img(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn,
-                file_size,
-            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
-            lsn: filename.lsn,
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -407,7 +275,7 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
@@ -421,18 +289,8 @@ impl ImageLayer {
        })
    }

-    fn layer_name(&self) -> ImageFileName {
-        self.desc.image_file_name()
-    }
-
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> Utf8PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.desc.timeline_id,
-            self.desc.tenant_id,
-            &self.layer_name(),
-        )
+    fn path(&self) -> Utf8PathBuf {
+        self.path.clone()
    }
 }

@@ -604,7 +462,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    async fn finish(self) -> anyhow::Result<ImageLayer> {
+    async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -658,33 +516,14 @@ impl ImageLayerWriterInner {
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = ImageLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc,
-            lsn: self.lsn,
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };

        // fsync the file
        file.sync_all().await?;

-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = ImageLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            self.timeline_id,
-            self.tenant_id,
-            &ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn,
-            },
-        );
-        std::fs::rename(self.path, final_path)?;
+        // FIXME: why not carry the virtualfile here, it supports renaming?
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.path());
+        trace!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -746,8 +585,11 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
-        self.inner.take().unwrap().finish().await
+    pub(crate) async fn finish(
+        mut self,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,11 +10,12 @@ use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::collections::HashMap;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -28,7 +29,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::RwLock;

-use super::{DeltaLayer, DeltaLayerWriter, Layer};
+use super::{DeltaLayerWriter, ResidentLayer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -207,20 +208,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
-impl Layer for InMemoryLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
-            .await
-    }
-}
-
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
@@ -229,17 +216,13 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
-    ///
    /// Get layer size.
-    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
        Ok(inner.file.len())
    }

-    ///
    /// Create a new, empty, in-memory layer
-    ///
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -331,7 +314,11 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(
+        &self,
+        timeline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -376,7 +363,8 @@ impl InMemoryLayer {
            }
        }

-        let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
+        // MAX is used here because we identify L0 layers by full key range
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,3 @@
-use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -6,7 +5,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{context::RequestContext, repository::Key};
+use crate::repository::Key;

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -100,6 +99,22 @@ impl PersistentLayerDesc {
        }
    }

+    pub fn from_filename(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        filename: LayerFileName,
+        file_size: u64,
+    ) -> Self {
+        match filename {
+            LayerFileName::Image(i) => {
+                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
+            }
+            LayerFileName::Delta(d) => {
+                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+            }
+        }
+    }
+
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -173,21 +188,31 @@ impl PersistentLayerDesc {
        self.is_delta
    }

-    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental(),
-            self.file_size,
-        );
-
-        Ok(())
+    pub fn dump(&self) {
+        if self.is_delta {
+            println!(
+                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.lsn_range.start,
+                self.lsn_range.end,
+                self.is_incremental(),
+                self.file_size,
+            );
+        } else {
+            println!(
+                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.image_layer_lsn(),
+                self.is_incremental(),
+                self.file_size
+            );
+        }
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -1,216 +0,0 @@
-//! A RemoteLayer is an in-memory placeholder for a layer file that exists
-//! in remote storage.
-//!
-use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::Key;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::timeline::layer_manager::LayerManager;
-use anyhow::{bail, Result};
-use camino::Utf8PathBuf;
-use pageserver_api::models::HistoricLayerInfo;
-use std::ops::Range;
-use std::sync::Arc;
-
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-use super::filename::{DeltaFileName, ImageFileName};
-use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
-};
-
-/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
-/// [`DeltaLayer`].
-///
-/// RemoteLayer might be downloaded on-demand during operations which are
-/// allowed download remote layers and during which, it gets replaced with a
-/// concrete `DeltaLayer` or `ImageLayer`.
-///
-/// See: [`crate::context::RequestContext`] for authorization to download
-pub struct RemoteLayer {
-    pub desc: PersistentLayerDesc,
-
-    pub layer_metadata: LayerFileMetadata,
-
-    access_stats: LayerAccessStats,
-
-    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
-
-    /// Has `LayerMap::replace` failed for this (true) or not (false).
-    ///
-    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
-    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
-    /// unprocessable, because a LayerMap::replace failed.
-    ///
-    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
-    /// a possible fast loop between `Timeline::get_reconstruct_data` and
-    /// `Timeline::download_remote_layer`, which also logs.
-    ///
-    /// [`ongoing_download`]: Self::ongoing_download
-    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
-}
-
-impl std::fmt::Debug for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.desc.filename())
-            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.desc.is_incremental())
-            .finish()
-    }
-}
-
-#[async_trait::async_trait]
-impl Layer for RemoteLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for RemoteLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for RemoteLayer {
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        None
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        bail!("remote layer has no layer file");
-    }
-
-    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        Some(self)
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        true
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        if self.desc.is_delta {
-            HistoricLayerInfo::Delta {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                lsn_end: lsn_range.end,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        } else {
-            HistoricLayerInfo::Image {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
-impl RemoteLayer {
-    pub fn new_img(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &ImageFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_img(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn,
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    pub fn new_delta(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &DeltaFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_delta(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn_range.clone(),
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub(crate) fn create_downloaded_layer(
-        &self,
-        _layer_map_lock_held_witness: &LayerManager,
-        conf: &'static PageServerConf,
-        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
-        if self.desc.is_delta {
-            let fname = self.desc.delta_file_name();
-            Arc::new(DeltaLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
-            ))
-        } else {
-            let fname = self.desc.image_file_name();
-            Arc::new(ImageLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
-            ))
-        }
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -159,11 +159,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
            // loop, #3501. There are also additional "allowed_errors" in tests.
-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
+            if first && random_init_delay(period, &cancel).await.is_err() {
+                break;
            }

            let started_at = Instant::now();
@@ -183,7 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            if !first {
+                // The first iteration is typically much slower, because all tenants compete for the
+                // compaction sempahore to run, and because of concurrent startup work like initializing
+                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
+                warn_when_period_overrun(
+                    started_at.elapsed(),
+                    period,
+                    BackgroundLoopKind::Compaction,
+                );
+            }

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -192,6 +198,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
+
+            first = false;
        }
    }
    .await;
@@ -223,11 +231,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            let period = tenant.get_gc_period();

-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
+            if first && random_init_delay(period, &cancel).await.is_err() {
+                break;
            }

            let started_at = Instant::now();
@@ -251,7 +256,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            if !first {
+                // The first iteration is typically much slower, because all tenants compete for the
+                // compaction sempahore to run, and because of concurrent startup work like initializing
+                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
+                warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            }

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -260,6 +270,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
+
+            first = false;
        }
    }
    .await;
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,7 +29,6 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        storage_layer::PersistentLayer,
        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
@@ -210,15 +209,26 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<Arc<dyn PersistentLayer>> = {
+        let candidates: Vec<_> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-                if hist_layer.is_remote_layer() {
-                    continue;
-                }
+
+                // guard against eviction while we inspect it; it might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space. both succeeding is of no consequence.
+                let guard = match hist_layer.keep_resident().await {
+                    Ok(Some(l)) => l,
+                    Ok(None) => continue,
+                    Err(e) => {
+                        // these should not happen, but we cannot make them statically impossible right
+                        // now.
+                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
+                        continue;
+                    }
+                };

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -249,7 +259,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(hist_layer)
+                    candidates.push(guard.drop_eviction_guard())
                }
            }
            candidates
@@ -268,7 +278,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
+            .evict_layer_batch(remote_client, &candidates, cancel)
            .await
        {
            Err(pre_err) => {
@@ -279,7 +289,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for (l, result) in candidates.iter().zip(results) {
+        for result in results {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -287,24 +297,10 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::FileNotFound)) => {
+                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
-                Some(Err(
-                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
-                )) => {
-                    let e = utils::error::report_compact_sources(&e);
-                    warn!(layer = %l, "failed to evict layer: {e}");
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                    warn!(layer = %l, "failed to evict layer: {detail}");
-                    stats.not_evictable += 1;
-                }
            }
        }
        if stats.candidates == stats.not_evictable {
@@ -348,20 +344,7 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        //
-        // It is critical we are responsive to cancellation here. Otherwise, we deadlock with
-        // tenant deletion (holds TENANTS in read mode) any other task that attempts to
-        // acquire TENANTS in write mode before we here call get_tenant.
-        // See https://github.com/neondatabase/neon/issues/5284.
-        let res = tokio::select! {
-            _ = cancel.cancelled() => {
-                return ControlFlow::Break(());
-            }
-            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
-                res
-            }
-        };
-        let tenant = match res {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -12,27 +12,16 @@ use crate::{
    tenant::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
-            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
-            PersistentLayerDesc, PersistentLayerKey,
+            AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
+            ResidentLayer,
        },
-        timeline::compare_arced_layers,
    },
 };

 /// Provides semantic APIs to manipulate the layer map.
 pub(crate) struct LayerManager {
    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager,
-}
-
-/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
-/// scheduling deletes in remote client.
-pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
-
-impl ApplyGcResultGuard<'_> {
-    pub(crate) fn flush(self) {
-        self.0.flush();
-    }
+    layer_fmgr: LayerFileManager<Layer>,
 }

 impl LayerManager {
@@ -43,7 +32,7 @@ impl LayerManager {
        }
    }

-    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
+    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.layer_fmgr.get_from_desc(desc)
    }

@@ -55,21 +44,12 @@ impl LayerManager {
        &self.layer_map
    }

-    /// Replace layers in the layer file manager, used in evictions and layer downloads.
-    pub(crate) fn replace_and_verify(
-        &mut self,
-        expected: Arc<dyn PersistentLayer>,
-        new: Arc<dyn PersistentLayer>,
-    ) -> Result<()> {
-        self.layer_fmgr.replace_and_verify(expected, new)
-    }
-
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
    pub(crate) fn initialize_local_layers(
        &mut self,
-        on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
+        on_disk_layers: Vec<Layer>,
        next_open_layer_at: Lsn,
    ) {
        let mut updates = self.layer_map.batch_update();
@@ -164,10 +144,19 @@ impl LayerManager {
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
-    pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
+    pub(crate) fn track_new_image_layers(
+        &mut self,
+        image_layers: &[ResidentLayer],
+        metrics: &TimelineMetrics,
+    ) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
-            Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+
+            // record these here instead of Layer::finish_creating because otherwise partial
+            // failure with create_image_layers would balloon up the physical size gauge. downside
+            // is that all layers need to be created before metrics are updated.
+            metrics.record_new_file_metrics(layer.layer_desc().file_size);
        }
        updates.flush();
    }
@@ -175,76 +164,71 @@ impl LayerManager {
    /// Flush a frozen layer and add the written delta layer to the layer map.
    pub(crate) fn finish_flush_l0_layer(
        &mut self,
-        delta_layer: Option<DeltaLayer>,
+        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
+        metrics: &TimelineMetrics,
    ) {
-        let l = self.layer_map.frozen_layers.pop_front();
-        let mut updates = self.layer_map.batch_update();
+        let inmem = self
+            .layer_map
+            .frozen_layers
+            .pop_front()
+            .expect("there must be a inmem layer to flush");

-        // Only one thread may call this function at a time (for this
-        // timeline). If two threads tried to flush the same frozen
+        // Only one task may call this function at a time (for this
+        // timeline). If two tasks tried to flush the same frozen
        // layer to disk at the same time, that would not work.
-        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
+        assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));

-        if let Some(delta_layer) = delta_layer {
-            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
+        if let Some(l) = delta_layer {
+            let mut updates = self.layer_map.batch_update();
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
+            updates.flush();
        }
-        updates.flush();
    }

    /// Called when compaction is completed.
    pub(crate) fn finish_compact_l0(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        compact_from: Vec<Arc<dyn PersistentLayer>>,
-        compact_to: Vec<Arc<dyn PersistentLayer>>,
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        compact_from: &[Layer],
+        compact_to: &[ResidentLayer],
        metrics: &TimelineMetrics,
-    ) -> Result<()> {
+    ) {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
-            Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        for l in compact_from {
-            // NB: the layer file identified by descriptor `l` is guaranteed to be present
-            // in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
-            // time, even though we dropped `Timeline::layers` inbetween.
-            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
-                l,
-                &mut updates,
-                metrics,
-                &mut self.layer_fmgr,
-            )?;
+            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
-        Ok(())
    }

    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
    pub(crate) fn finish_gc_timeline(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Arc<dyn PersistentLayer>>,
-        metrics: &TimelineMetrics,
-    ) -> Result<ApplyGcResultGuard> {
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        gc_layers: Vec<Layer>,
+    ) {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
-                doomed_layer,
+                layer_removal_cs,
+                &doomed_layer,
                &mut updates,
-                metrics,
                &mut self.layer_fmgr,
-            )?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
+            );
        }
-        Ok(ApplyGcResultGuard(updates))
+        updates.flush()
    }

    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
+        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
+        mapping: &mut LayerFileManager<Layer>,
    ) {
        updates.insert_historic(layer.layer_desc().clone());
        mapping.insert(layer);
@@ -254,17 +238,12 @@ impl LayerManager {
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        layer: Arc<dyn PersistentLayer>,
+        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer: &Layer,
        updates: &mut BatchedUpdates<'_>,
-        metrics: &TimelineMetrics,
-        mapping: &mut LayerFileManager,
-    ) -> anyhow::Result<()> {
+        mapping: &mut LayerFileManager<Layer>,
+    ) {
        let desc = layer.layer_desc();
-        if !layer.is_remote_layer() {
-            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_sub(desc.file_size);
-        }

        // TODO Removing from the bottom of the layer map is expensive.
        //      Maybe instead discard all layer map historic versions that
@@ -273,21 +252,18 @@ impl LayerManager {
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
-
-        Ok(())
+        layer.garbage_collect_on_drop();
    }

-    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
 }

-pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
-    HashMap<PersistentLayerKey, Arc<T>>,
-);
+pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

-impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
+impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
@@ -297,14 +273,14 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
            .clone()
    }

-    pub(crate) fn insert(&mut self, layer: Arc<T>) {
+    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
            panic!("overwriting a layer: {:?}", layer.layer_desc())
        }
    }

-    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
+    pub(crate) fn contains(&self, layer: &T) -> bool {
        self.0.contains_key(&layer.layer_desc().key())
    }

@@ -312,7 +288,7 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        Self(HashMap::new())
    }

-    pub(crate) fn remove(&mut self, layer: Arc<T>) {
+    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
            panic!(
@@ -321,39 +297,4 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
            )
        }
    }
-
-    pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
-        let key = expected.layer_desc().key();
-        let other = new.layer_desc().key();
-
-        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
-        let new_l0 = LayerMap::is_l0(new.layer_desc());
-
-        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
-            "layermap-replace-notfound"
-        ));
-
-        anyhow::ensure!(
-            key == other,
-            "expected and new layer have different keys: {key:?} != {other:?}"
-        );
-
-        anyhow::ensure!(
-            expected_l0 == new_l0,
-            "one layer is l0 while the other is not: {expected_l0} != {new_l0}"
-        );
-
-        if let Some(layer) = self.0.get_mut(&key) {
-            anyhow::ensure!(
-                compare_arced_layers(&expected, layer),
-                "another layer was found instead of expected, expected={expected:?}, new={new:?}",
-                expected = Arc::as_ptr(&expected),
-                new = Arc::as_ptr(layer),
-            );
-            *layer = new;
-            Ok(())
-        } else {
-            anyhow::bail!("layer was not found");
-        }
-    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -26,8 +26,7 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::BrokerClientChannel;
-use storage_broker::Streaming;
+use storage_broker::{BrokerClientChannel, Code, Streaming};
 use tokio::select;
 use tracing::*;

@@ -137,8 +136,17 @@ pub(super) async fn connection_manager_loop_step(
            broker_update = broker_subscription.message() => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
-                    Err(e) => {
-                        error!("broker subscription failed: {e}");
+                    Err(status) => {
+                        match status.code() {
+                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
+                                // tonic's error handling doesn't provide a clear code for disconnections: we get
+                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
+                                info!("broker disconnected: {status}");
+                            },
+                            _ => {
+                                warn!("broker subscription failed: {status}");
+                            }
+                        }
                        return ControlFlow::Continue(());
                    }
                    Ok(None) => {
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,4 +1,5 @@
 use super::storage_layer::LayerFileName;
+use super::storage_layer::ResidentLayer;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -203,18 +204,6 @@ impl UploadQueue {
            UploadQueue::Stopped(stopped) => Ok(stopped),
        }
    }
-
-    pub(crate) fn get_layer_metadata(
-        &self,
-        name: &LayerFileName,
-    ) -> anyhow::Result<Option<LayerFileMetadata>> {
-        match self {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()),
-        }
-    }
 }

 /// An in-progress upload or delete task.
@@ -237,7 +226,7 @@ pub(crate) struct Delete {
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
+    UploadLayer(ResidentLayer, LayerFileMetadata),

    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
@@ -252,13 +241,13 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
-            UploadOp::UploadLayer(path, metadata) => {
+            UploadOp::UploadLayer(layer, metadata) => {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    path.file_name(),
+                    layer,
                    metadata.file_size(),
-                    metadata.generation,
+                    metadata.generation
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/poetry.lock
+++ b/poetry.lock
@@ -2447,20 +2447,20 @@ test = ["websockets"]

 [[package]]
 name = "werkzeug"
-version = "2.2.3"
+version = "3.0.1"
 description = "The comprehensive WSGI web application library."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
-    {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
+    {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
+    {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
 ]

 [package.dependencies]
 MarkupSafe = ">=2.1.1"

 [package.extras]
-watchdog = ["watchdog"]
+watchdog = ["watchdog (>=2.3)"]

 [[package]]
 name = "wrapt"
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c5981d8d7c2deadd47c823bc35f86f830c8e320b653d2d3718bade1f4d2dabca"
+content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,7 +3,9 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
+use tokio_postgres::config::AuthKeys;

+use crate::proxy::{handle_try_wake, retry_after};
 use crate::{
    auth::{self, ClientCredentials},
    config::AuthenticationConfig,
@@ -16,8 +18,9 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
+use std::ops::ControlFlow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{error, info, warn};

 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -117,22 +120,27 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
    }
 }

+pub enum ComputeCredentials {
+    Password(Vec<u8>),
+    AuthKeys(AuthKeys),
+}
+
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks(
+async fn auth_quirks_creds(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
    if creds.project.is_none() {
        // Password will be checked by the compute node later.
-        return hacks::password_hack(api, extra, creds, client).await;
+        return hacks::password_hack(creds, client).await;
    }

    // Password hack should set the project name.
@@ -143,13 +151,55 @@ async fn auth_quirks(
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
        // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(api, extra, creds, client).await;
+        return hacks::cleartext_hack(client).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
    classic::authenticate(api, extra, creds, client, config).await
 }

+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
+
+    let mut num_retries = 0;
+    let mut node = loop {
+        let wake_res = api.wake_compute(extra, creds).await;
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            Ok(ControlFlow::Break(n)) => break n,
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+        tokio::time::sleep(wait_duration).await;
+    };
+
+    match auth_stuff.value {
+        ComputeCredentials::Password(password) => node.config.password(password),
+        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
+    };
+
+    Ok(AuthSuccess {
+        reported_auth_ok: auth_stuff.reported_auth_ok,
+        value: node,
+    })
+}
+
 impl BackendType<'_, ClientCredentials<'_>> {
    /// Get compute endpoint name from the credentials.
    pub fn get_endpoint(&self) -> Option<String> {
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,17 +1,14 @@
-use std::ops::ControlFlow;
-
-use super::AuthSuccess;
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    config::AuthenticationConfig,
-    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::{handle_try_wake, retry_after},
+    console::{self, AuthInfo, ConsoleReqExtra},
    sasl, scram,
    stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, warn};
+use tracing::{info, warn};

 pub(super) async fn authenticate(
    api: &impl console::Api,
@@ -19,7 +16,7 @@ pub(super) async fn authenticate(
    creds: &ClientCredentials<'_>,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    config: &'static AuthenticationConfig,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    info!("fetching user's authentication info");
    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
        // If we don't have an authentication secret, we mock one to
@@ -66,38 +63,17 @@ pub(super) async fn authenticate(
                }
            };

-            Some(compute::ScramKeys {
+            compute::ScramKeys {
                client_key: client_key.as_bytes(),
                server_key: secret.server_key.as_bytes(),
-            })
+            }
        }
    };

-    let mut num_retries = 0;
-    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            Ok(ControlFlow::Break(n)) => break n,
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-        tokio::time::sleep(wait_duration).await;
-    };
-    if let Some(keys) = scram_keys {
-        use tokio_postgres::config::AuthKeys;
-        node.config.auth_keys(AuthKeys::ScramSha256(keys));
-    }
-
    Ok(AuthSuccess {
        reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+            scram_keys,
+        )),
    })
 }
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,10 +1,6 @@
-use super::AuthSuccess;
+use super::{AuthSuccess, ComputeCredentials};
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
-    console::{
-        self,
-        provider::{CachedNodeInfo, ConsoleReqExtra},
-    },
    stream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -15,11 +11,8 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn cleartext_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("cleartext auth flow override is enabled, proceeding");
    let password = AuthFlow::new(client)
        .begin(auth::CleartextPassword)
@@ -27,24 +20,19 @@ pub async fn cleartext_hack(
        .authenticate()
        .await?;

-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(password);
-
    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
        reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::Password(password),
    })
 }

 /// Workaround for clients which don't provide an endpoint (project) name.
 /// Very similar to [`cleartext_hack`], but there's a specific password format.
 pub async fn password_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("project not specified, resorting to the password hack auth flow");
    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
@@ -55,12 +43,9 @@ pub async fn password_hack(
    info!(project = &payload.endpoint, "received missing parameter");
    creds.project = Some(payload.endpoint);

-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(payload.password);
-
    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
        reported_auth_ok: false,
-        value: node,
+        value: ComputeCredentials::Password(payload.password),
    })
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -4,10 +4,11 @@ use proxy::config::AuthenticationConfig;
 use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
-use proxy::metrics;
+use proxy::usage_metrics;

 use anyhow::bail;
 use proxy::config::{self, ProxyConfig};
+use proxy::serverless;
 use std::pin::pin;
 use std::{borrow::Cow, net::SocketAddr};
 use tokio::net::TcpListener;
@@ -129,14 +130,16 @@ async fn main() -> anyhow::Result<()> {
        cancellation_token.clone(),
    ));

-    if let Some(wss_address) = args.wss {
-        let wss_address: SocketAddr = wss_address.parse()?;
-        info!("Starting wss on {wss_address}");
-        let wss_listener = TcpListener::bind(wss_address).await?;
+    // TODO: rename the argument to something like serverless.
+    // It now covers more than just websockets, it also covers SQL over HTTP.
+    if let Some(serverless_address) = args.wss {
+        let serverless_address: SocketAddr = serverless_address.parse()?;
+        info!("Starting wss on {serverless_address}");
+        let serverless_listener = TcpListener::bind(serverless_address).await?;

-        client_tasks.spawn(http::websocket::task_main(
+        client_tasks.spawn(serverless::task_main(
            config,
-            wss_listener,
+            serverless_listener,
            cancellation_token.clone(),
        ));
    }
@@ -144,11 +147,11 @@ async fn main() -> anyhow::Result<()> {
    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
-    maintenance_tasks.spawn(http::server::task_main(http_listener));
+    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
-        maintenance_tasks.spawn(metrics::task_main(metrics_config));
+        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
    }

    let maintenance = loop {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -90,7 +90,11 @@ pub mod errors {
                    status: http::StatusCode::LOCKED,
                    ref text,
                } => {
-                    !text.contains("written data quota exceeded")
+                    // written data quota exceeded
+                    // data transfer quota exceeded
+                    // compute time quota exceeded
+                    // logical size quota exceeded
+                    !text.contains("quota exceeded")
                        && !text.contains("the limit for current plan reached")
                }
                // retry server errors
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -2,10 +2,7 @@
 //! Other modules should use stuff from this module instead of
 //! directly relying on deps like `reqwest` (think loose coupling).

-pub mod conn_pool;
-pub mod server;
-pub mod sql_over_http;
-pub mod websocket;
+pub mod health_server;

 use std::{sync::Arc, time::Duration};

--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -14,14 +14,15 @@ pub mod console;
 pub mod error;
 pub mod http;
 pub mod logging;
-pub mod metrics;
 pub mod parse;
 pub mod protocol2;
 pub mod proxy;
 pub mod sasl;
 pub mod scram;
+pub mod serverless;
 pub mod stream;
 pub mod url;
+pub mod usage_metrics;
 pub mod waiters;

 /// Handle unix signals appropriately.
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -8,9 +8,9 @@ use crate::{
    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
    http::StatusCode,
-    metrics::{Ids, USAGE_METRICS},
    protocol2::WithClientIp,
    stream::{PqStream, Stream},
+    usage_metrics::{Ids, USAGE_METRICS},
 };
 use anyhow::{bail, Context};
 use async_trait::async_trait;
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -1,235 +1,36 @@
-use crate::{
-    cancellation::CancelMap,
-    config::ProxyConfig,
-    error::io_error,
-    protocol2::{ProxyProtocolAccept, WithClientIp},
-    proxy::{
-        handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
-        NUM_CLIENT_CONNECTION_OPENED_COUNTER,
-    },
-};
+//! Routers for our serverless APIs
+//!
+//! Handles both SQL over HTTP and SQL over Websockets.
+
+mod conn_pool;
+mod sql_over_http;
+mod websocket;
+
 use anyhow::bail;
-use bytes::{Buf, Bytes};
-use futures::{Sink, Stream, StreamExt};
+use hyper::StatusCode;
+pub use reqwest_middleware::{ClientWithMiddleware, Error};
+pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+
+use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
+use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
+use crate::{cancellation::CancelMap, config::ProxyConfig};
+use futures::StreamExt;
 use hyper::{
    server::{
        accept,
        conn::{AddrIncoming, AddrStream},
    },
-    upgrade::Upgraded,
-    Body, Method, Request, Response, StatusCode,
+    Body, Method, Request, Response,
 };
-use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
-use pin_project_lite::pin_project;

-use std::{
-    future::ready,
-    pin::Pin,
-    sync::Arc,
-    task::{ready, Context, Poll},
-};
+use std::task::Poll;
+use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
-use tokio::{
-    io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
-    net::TcpListener,
-};
+use tokio::net::TcpListener;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};

-// TODO: use `std::sync::Exclusive` once it's stabilized.
-// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
-use sync_wrapper::SyncWrapper;
-
-use super::{conn_pool::GlobalConnPool, sql_over_http};
-
-pin_project! {
-    /// This is a wrapper around a [`WebSocketStream`] that
-    /// implements [`AsyncRead`] and [`AsyncWrite`].
-    pub struct WebSocketRw {
-        #[pin]
-        stream: SyncWrapper<WebSocketStream<Upgraded>>,
-        bytes: Bytes,
-    }
-}
-
-impl WebSocketRw {
-    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
-        Self {
-            stream: stream.into(),
-            bytes: Bytes::new(),
-        }
-    }
-}
-
-impl AsyncWrite for WebSocketRw {
-    fn poll_write(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-        buf: &[u8],
-    ) -> Poll<io::Result<usize>> {
-        let mut stream = self.project().stream.get_pin_mut();
-
-        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
-        match stream.as_mut().start_send(Message::Binary(buf.into())) {
-            Ok(()) => Poll::Ready(Ok(buf.len())),
-            Err(e) => Poll::Ready(Err(io_error(e))),
-        }
-    }
-
-    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
-        stream.poll_flush(cx).map_err(io_error)
-    }
-
-    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
-        stream.poll_close(cx).map_err(io_error)
-    }
-}
-
-impl AsyncRead for WebSocketRw {
-    fn poll_read(
-        mut self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-        buf: &mut ReadBuf<'_>,
-    ) -> Poll<io::Result<()>> {
-        if buf.remaining() > 0 {
-            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
-            let len = std::cmp::min(bytes.len(), buf.remaining());
-            buf.put_slice(&bytes[..len]);
-            self.consume(len);
-        }
-
-        Poll::Ready(Ok(()))
-    }
-}
-
-impl AsyncBufRead for WebSocketRw {
-    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
-        // Please refer to poll_fill_buf's documentation.
-        const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
-
-        let mut this = self.project();
-        loop {
-            if !this.bytes.chunk().is_empty() {
-                let chunk = (*this.bytes).chunk();
-                return Poll::Ready(Ok(chunk));
-            }
-
-            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
-            match res.transpose().map_err(io_error)? {
-                Some(message) => match message {
-                    Message::Ping(_) => {}
-                    Message::Pong(_) => {}
-                    Message::Text(text) => {
-                        // We expect to see only binary messages.
-                        let error = "unexpected text message in the websocket";
-                        warn!(length = text.len(), error);
-                        return Poll::Ready(Err(io_error(error)));
-                    }
-                    Message::Frame(_) => {
-                        // This case is impossible according to Frame's doc.
-                        panic!("unexpected raw frame in the websocket");
-                    }
-                    Message::Binary(chunk) => {
-                        assert!(this.bytes.is_empty());
-                        *this.bytes = Bytes::from(chunk);
-                    }
-                    Message::Close(_) => return EOF,
-                },
-                None => return EOF,
-            }
-        }
-    }
-
-    fn consume(self: Pin<&mut Self>, amount: usize) {
-        self.project().bytes.advance(amount);
-    }
-}
-
-async fn serve_websocket(
-    websocket: HyperWebsocket,
-    config: &'static ProxyConfig,
-    cancel_map: &CancelMap,
-    session_id: uuid::Uuid,
-    hostname: Option<String>,
-) -> anyhow::Result<()> {
-    let websocket = websocket.await?;
-    handle_client(
-        config,
-        cancel_map,
-        session_id,
-        WebSocketRw::new(websocket),
-        ClientMode::Websockets { hostname },
-    )
-    .await?;
-    Ok(())
-}
-
-async fn ws_handler(
-    mut request: Request<Body>,
-    config: &'static ProxyConfig,
-    conn_pool: Arc<GlobalConnPool>,
-    cancel_map: Arc<CancelMap>,
-    session_id: uuid::Uuid,
-    sni_hostname: Option<String>,
-) -> Result<Response<Body>, ApiError> {
-    let host = request
-        .headers()
-        .get("host")
-        .and_then(|h| h.to_str().ok())
-        .and_then(|h| h.split(':').next())
-        .map(|s| s.to_string());
-
-    // Check if the request is a websocket upgrade request.
-    if hyper_tungstenite::is_upgrade_request(&request) {
-        info!(session_id = ?session_id, "performing websocket upgrade");
-
-        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
-            .map_err(|e| ApiError::BadRequest(e.into()))?;
-
-        tokio::spawn(
-            async move {
-                if let Err(e) =
-                    serve_websocket(websocket, config, &cancel_map, session_id, host).await
-                {
-                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
-                }
-            }
-            .in_current_span(),
-        );
-
-        // Return the response so the spawned future can continue.
-        Ok(response)
-    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
-    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
-    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        sql_over_http::handle(
-            request,
-            sni_hostname,
-            conn_pool,
-            session_id,
-            &config.http_config,
-        )
-        .await
-    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
-        Response::builder()
-            .header("Allow", "OPTIONS, POST")
-            .header("Access-Control-Allow-Origin", "*")
-            .header(
-                "Access-Control-Allow-Headers",
-                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
-            )
-            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
-            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
-            .body(Body::empty())
-            .map_err(|e| ApiError::InternalServerError(e.into()))
-    } else {
-        json_response(StatusCode::BAD_REQUEST, "query is not supported")
-    }
-}
-
 pub async fn task_main(
    config: &'static ProxyConfig,
    ws_listener: TcpListener,
@@ -239,7 +40,7 @@ pub async fn task_main(
        info!("websocket server has shut down");
    }

-    let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
+    let conn_pool = conn_pool::GlobalConnPool::new(config);

    // shutdown the connection pool
    tokio::spawn({
@@ -300,13 +101,15 @@ pub async fn task_main(
                            let cancel_map = Arc::new(CancelMap::default());
                            let session_id = uuid::Uuid::new_v4();

-                            ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
-                                .instrument(info_span!(
-                                    "ws-client",
-                                    session = %session_id,
-                                    %peer_addr,
-                                ))
-                                .await
+                            request_handler(
+                                req, config, conn_pool, cancel_map, session_id, sni_name,
+                            )
+                            .instrument(info_span!(
+                                "serverless",
+                                session = %session_id,
+                                %peer_addr,
+                            ))
+                            .await
                        }
                    },
                )))
@@ -359,3 +162,65 @@ where
        self.inner.call(req)
    }
 }
+
+async fn request_handler(
+    mut request: Request<Body>,
+    config: &'static ProxyConfig,
+    conn_pool: Arc<conn_pool::GlobalConnPool>,
+    cancel_map: Arc<CancelMap>,
+    session_id: uuid::Uuid,
+    sni_hostname: Option<String>,
+) -> Result<Response<Body>, ApiError> {
+    let host = request
+        .headers()
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next())
+        .map(|s| s.to_string());
+
+    // Check if the request is a websocket upgrade request.
+    if hyper_tungstenite::is_upgrade_request(&request) {
+        info!(session_id = ?session_id, "performing websocket upgrade");
+
+        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+            .map_err(|e| ApiError::BadRequest(e.into()))?;
+
+        tokio::spawn(
+            async move {
+                if let Err(e) =
+                    websocket::serve_websocket(websocket, config, &cancel_map, session_id, host)
+                        .await
+                {
+                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
+                }
+            }
+            .in_current_span(),
+        );
+
+        // Return the response so the spawned future can continue.
+        Ok(response)
+    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
+        sql_over_http::handle(
+            request,
+            sni_hostname,
+            conn_pool,
+            session_id,
+            &config.http_config,
+        )
+        .await
+    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
+        Response::builder()
+            .header("Allow", "OPTIONS, POST")
+            .header("Access-Control-Allow-Origin", "*")
+            .header(
+                "Access-Control-Allow-Headers",
+                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
+            )
+            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
+            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
+            .body(Body::empty())
+            .map_err(|e| ApiError::InternalServerError(e.into()))
+    } else {
+        json_response(StatusCode::BAD_REQUEST, "query is not supported")
+    }
+}
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -22,8 +22,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
    auth, console,
-    metrics::{Ids, MetricCounter, USAGE_METRICS},
    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
+    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};

@@ -191,22 +191,39 @@ impl GlobalConnPool {
        // ok return cached connection if found and establish a new one otherwise
        let new_client = if let Some(client) = client {
            if client.inner.is_closed() {
-                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
+                let conn_id = uuid::Uuid::new_v4();
+                info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
+                connect_to_compute(
+                    self.proxy_config,
+                    conn_info,
+                    conn_id,
+                    session_id,
+                    latency_timer,
+                )
+                .await
            } else {
                info!("pool: reusing connection '{conn_info}'");
                client.session.send(session_id)?;
                latency_timer.pool_hit();
                latency_timer.success();
                return Ok(Client {
+                    conn_id: client.conn_id,
                    inner: Some(client),
                    span: Span::current(),
                    pool,
                });
            }
        } else {
-            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
+            let conn_id = uuid::Uuid::new_v4();
+            info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+            connect_to_compute(
+                self.proxy_config,
+                conn_info,
+                conn_id,
+                session_id,
+                latency_timer,
+            )
+            .await
        };

        match &new_client {
@@ -243,6 +260,7 @@ impl GlobalConnPool {
        }

        new_client.map(|inner| Client {
+            conn_id: inner.conn_id,
            inner: Some(inner),
            span: Span::current(),
            pool,
@@ -250,16 +268,18 @@ impl GlobalConnPool {
    }

    fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
+        let conn_id = client.conn_id;
+
        // We want to hold this open while we return. This ensures that the pool can't close
        // while we are in the middle of returning the connection.
        let closed = self.closed.read();
        if *closed {
-            info!("pool: throwing away connection '{conn_info}' because pool is closed");
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
            return Ok(());
        }

        if client.inner.is_closed() {
-            info!("pool: throwing away connection '{conn_info}' because connection is closed");
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
            return Ok(());
        }

@@ -291,9 +311,9 @@ impl GlobalConnPool {

        // do logging outside of the mutex
        if returned {
-            info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
        } else {
-            info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
        }

        Ok(())
@@ -340,6 +360,7 @@ impl GlobalConnPool {
 struct TokioMechanism<'a> {
    conn_info: &'a ConnInfo,
    session_id: uuid::Uuid,
+    conn_id: uuid::Uuid,
 }

 #[async_trait]
@@ -353,7 +374,14 @@ impl ConnectMechanism for TokioMechanism<'_> {
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
+        connect_to_compute_once(
+            node_info,
+            self.conn_info,
+            timeout,
+            self.conn_id,
+            self.session_id,
+        )
+        .await
    }

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -366,6 +394,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
+    conn_id: uuid::Uuid,
    session_id: uuid::Uuid,
    latency_timer: LatencyTimer,
 ) -> anyhow::Result<ClientInner> {
@@ -401,6 +430,7 @@ async fn connect_to_compute(

    crate::proxy::connect_to_compute(
        &TokioMechanism {
+            conn_id,
            conn_info,
            session_id,
        },
@@ -416,6 +446,7 @@ async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
+    conn_id: uuid::Uuid,
    mut session: uuid::Uuid,
 ) -> Result<ClientInner, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();
@@ -430,7 +461,6 @@ async fn connect_to_compute_once(

    let (tx, mut rx) = tokio::sync::watch::channel(session);

-    let conn_id = uuid::Uuid::new_v4();
    let span = info_span!(parent: None, "connection", %conn_id);
    span.in_scope(|| {
        info!(%conn_info, %session, "new connection");
@@ -484,6 +514,7 @@ async fn connect_to_compute_once(
        inner: client,
        session: tx,
        ids,
+        conn_id,
    })
 }

@@ -491,6 +522,7 @@ struct ClientInner {
    inner: tokio_postgres::Client,
    session: tokio::sync::watch::Sender<uuid::Uuid>,
    ids: Ids,
+    conn_id: uuid::Uuid,
 }

 impl Client {
@@ -500,12 +532,14 @@ impl Client {
 }

 pub struct Client {
+    conn_id: uuid::Uuid,
    span: Span,
    inner: Option<ClientInner>,
    pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
 }

 pub struct Discard<'a> {
+    conn_id: uuid::Uuid,
    pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
 }

@@ -514,6 +548,7 @@ impl Client {
        let Self {
            inner,
            pool,
+            conn_id,
            span: _,
        } = self;
        (
@@ -521,7 +556,10 @@ impl Client {
                .as_mut()
                .expect("client inner should not be removed")
                .inner,
-            Discard { pool },
+            Discard {
+                pool,
+                conn_id: *conn_id,
+            },
        )
    }

@@ -537,13 +575,13 @@ impl Discard<'_> {
    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
        if status != ReadyForQueryStatus::Idle {
            if let Some((conn_info, _)) = self.pool.take() {
-                info!("pool: throwing away connection '{conn_info}' because connection is not idle")
+                info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
            }
        }
    }
    pub fn discard(&mut self) {
        if let Some((conn_info, _)) = self.pool.take() {
-            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
+            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
        }
    }
 }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -0,0 +1,146 @@
+use crate::{
+    cancellation::CancelMap,
+    config::ProxyConfig,
+    error::io_error,
+    proxy::{handle_client, ClientMode},
+};
+use bytes::{Buf, Bytes};
+use futures::{Sink, Stream};
+use hyper::upgrade::Upgraded;
+use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
+use pin_project_lite::pin_project;
+
+use std::{
+    pin::Pin,
+    task::{ready, Context, Poll},
+};
+use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
+use tracing::warn;
+
+// TODO: use `std::sync::Exclusive` once it's stabilized.
+// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
+use sync_wrapper::SyncWrapper;
+
+pin_project! {
+    /// This is a wrapper around a [`WebSocketStream`] that
+    /// implements [`AsyncRead`] and [`AsyncWrite`].
+    pub struct WebSocketRw {
+        #[pin]
+        stream: SyncWrapper<WebSocketStream<Upgraded>>,
+        bytes: Bytes,
+    }
+}
+
+impl WebSocketRw {
+    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
+        Self {
+            stream: stream.into(),
+            bytes: Bytes::new(),
+        }
+    }
+}
+
+impl AsyncWrite for WebSocketRw {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        let mut stream = self.project().stream.get_pin_mut();
+
+        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+        match stream.as_mut().start_send(Message::Binary(buf.into())) {
+            Ok(()) => Poll::Ready(Ok(buf.len())),
+            Err(e) => Poll::Ready(Err(io_error(e))),
+        }
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let stream = self.project().stream.get_pin_mut();
+        stream.poll_flush(cx).map_err(io_error)
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let stream = self.project().stream.get_pin_mut();
+        stream.poll_close(cx).map_err(io_error)
+    }
+}
+
+impl AsyncRead for WebSocketRw {
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        if buf.remaining() > 0 {
+            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
+            let len = std::cmp::min(bytes.len(), buf.remaining());
+            buf.put_slice(&bytes[..len]);
+            self.consume(len);
+        }
+
+        Poll::Ready(Ok(()))
+    }
+}
+
+impl AsyncBufRead for WebSocketRw {
+    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+        // Please refer to poll_fill_buf's documentation.
+        const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
+
+        let mut this = self.project();
+        loop {
+            if !this.bytes.chunk().is_empty() {
+                let chunk = (*this.bytes).chunk();
+                return Poll::Ready(Ok(chunk));
+            }
+
+            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
+            match res.transpose().map_err(io_error)? {
+                Some(message) => match message {
+                    Message::Ping(_) => {}
+                    Message::Pong(_) => {}
+                    Message::Text(text) => {
+                        // We expect to see only binary messages.
+                        let error = "unexpected text message in the websocket";
+                        warn!(length = text.len(), error);
+                        return Poll::Ready(Err(io_error(error)));
+                    }
+                    Message::Frame(_) => {
+                        // This case is impossible according to Frame's doc.
+                        panic!("unexpected raw frame in the websocket");
+                    }
+                    Message::Binary(chunk) => {
+                        assert!(this.bytes.is_empty());
+                        *this.bytes = Bytes::from(chunk);
+                    }
+                    Message::Close(_) => return EOF,
+                },
+                None => return EOF,
+            }
+        }
+    }
+
+    fn consume(self: Pin<&mut Self>, amount: usize) {
+        self.project().bytes.advance(amount);
+    }
+}
+
+pub async fn serve_websocket(
+    websocket: HyperWebsocket,
+    config: &'static ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    let websocket = websocket.await?;
+    handle_client(
+        config,
+        cancel_map,
+        session_id,
+        WebSocketRw::new(websocket),
+        ClientMode::Websockets { hostname },
+    )
+    .await?;
+    Ok(())
+}
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
-Werkzeug = "^2.2.3"
+Werkzeug = "^3.0.1"
 pytest-order = "^1.1.0"
 allure-pytest = "^2.13.2"
 pytest-asyncio = "^0.21.0"
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -15,28 +15,15 @@ The script fetches the durations of benchmarks from the database and stores it i

 BENCHMARKS_DURATION_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, test,
-        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
-    FROM
-        (
-            SELECT
-                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
-                (jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
-            FROM
-                regress_test_results
-            WHERE
-                reference = 'refs/heads/main'
-        ) data
+        DISTINCT parent_suite, suite, name,
+        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
+    FROM results
    WHERE
-        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND parent_suite = 'test_runner.performance'
        AND status = 'passed'
    GROUP BY
-        parent_suite, suite, test
+        parent_suite, suite, name
    ;
 """

@@ -44,68 +31,69 @@ BENCHMARKS_DURATION_QUERY = """
 # the total duration varies from 8 to 40 minutes.
 # We use some pre-collected durations as a fallback to have a better distribution.
 FALLBACK_DURATION = {
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
-    "test_runner/performance/test_compaction.py::test_compaction": 77.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
-    "test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
-    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
-    "test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0,
-    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0,
-    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0,
-    "test_runner/performance/test_startup.py::test_startup_simple": 2.0,
-    "test_runner/performance/test_startup.py::test_startup": 539.0,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
+    "test_runner/performance/test_compaction.py::test_compaction": 110.222,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
+    "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
+    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
+    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
+    "test_runner/performance/test_startup.py::test_startup": 890.114,
+    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
 }


@@ -130,7 +118,7 @@ def main(args: argparse.Namespace):
        res = FALLBACK_DURATION

    for row in rows:
-        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}"
+        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
        duration = row["percentile_ms"] / 1000
        logging.info(f"\t{pytest_name}: {duration}")
        res[pytest_name] = duration
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -9,28 +9,15 @@ from typing import DefaultDict, Dict
 import psycopg2
 import psycopg2.extras

-# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
 FLAKY_TESTS_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
-    FROM
-        (
-            SELECT
-                reference,
-                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
-            FROM
-                regress_test_results
-        ) data
+        DISTINCT parent_suite, suite, name
+    FROM results
    WHERE
-        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND (
            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
-            OR retries_status_change::boolean
+            OR flaky
        )
    ;
 """
@@ -63,12 +50,14 @@ def main(args: argparse.Namespace):
        if row["parent_suite"] != "test_runner.regress":
            continue

-        deparametrized_test = row["deparametrized_test"]
-        dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
-        parametrized_test = deparametrized_test.replace(
-            "[",
-            f"[{build_type}-pg{pg_version}{dash_if_needed}",
-        )
+        if row["name"].endswith("]"):
+            parametrized_test = row["name"].replace(
+                "[",
+                f"[{build_type}-pg{pg_version}-",
+            )
+        else:
+            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
+
        res[row["parent_suite"]][row["suite"]][parametrized_test] = True

        logging.info(
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -4,7 +4,7 @@ use std::task::{Context, Poll};
 use std::time::Duration;
 use tonic::codegen::StdError;
 use tonic::transport::{ClientTlsConfig, Endpoint};
-use tonic::{transport::Channel, Code, Status};
+use tonic::{transport::Channel, Status};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};

 use proto::{
@@ -23,6 +23,7 @@ pub mod proto {
 pub mod metrics;

 // Re-exports to avoid direct tonic dependency in user crates.
+pub use tonic::Code;
 pub use tonic::Request;
 pub use tonic::Streaming;

--- a/test_runner/fixtures/log_helper.py
+++ b/test_runner/fixtures/log_helper.py
@@ -22,6 +22,11 @@ https://docs.pytest.org/en/6.2.x/logging.html
 # log format is specified in pytest.ini file
 LOGGING = {
    "version": 1,
+    "formatters": {
+        "standard": {
+            "datefmt": "%m/%d/%Y %I:%M:%SZ %p %Z",
+        }
+    },
    "loggers": {
        "root": {"level": "INFO"},
        "root.safekeeper_async": {"level": "INFO"},  # a lot of logs on DEBUG level
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1616,7 +1616,7 @@ class NeonPageserver(PgProtocol):
            ".*wait for layer upload ops to complete.*",  # .*Caused by:.*wait_completion aborted because upload queue was stopped
            ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping",  # When gc checks timeline state after acquiring layer_removal_cs
            ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant",  # Tenant::gc precondition
-            ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
+            ".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
            ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
            ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
            ".*task iteration took longer than the configured period.*",
@@ -1707,7 +1707,7 @@ class NeonPageserver(PgProtocol):

    @property
    def workdir(self) -> Path:
-        return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}"))
+        return self.env.repo_dir / f"pageserver_{self.id}"

    def assert_no_errors(self):
        logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
@@ -1719,6 +1719,11 @@ class NeonPageserver(PgProtocol):
                break

            if error_or_warn.search(line):
+                # Is this a torn log line?  This happens when force-killing a process and restarting
+                # Example: "2023-10-25T09:38:31.752314Z  WARN deletion executo2023-10-25T09:38:31.875947Z  INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
+                if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
+                    continue
+
                # It's an ERROR or WARN. Is it in the allow-list?
                for a in self.allowed_errors:
                    if re.match(a, line):
@@ -1768,6 +1773,16 @@ class NeonPageserver(PgProtocol):
        client = self.http_client()
        return client.tenant_attach(tenant_id, config, config_null, generation=generation)

+    def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
+        # This API is only for use when generations are enabled
+        assert self.env.attachment_service is not None
+
+        if config["mode"].startswith("Attached") and "generation" not in config:
+            config["generation"] = self.env.attachment_service.attach_hook(tenant_id, self.id)
+
+        client = self.http_client()
+        return client.tenant_location_conf(tenant_id, config, **kwargs)
+

 def append_pageserver_param_overrides(
    params_to_update: List[str],
@@ -2624,6 +2639,7 @@ class EndpointFactory:
        lsn: Optional[Lsn] = None,
        hot_standby: bool = False,
        config_lines: Optional[List[str]] = None,
+        pageserver_id: Optional[int] = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -2643,6 +2659,7 @@ class EndpointFactory:
            lsn=lsn,
            hot_standby=hot_standby,
            config_lines=config_lines,
+            pageserver_id=pageserver_id,
        )

    def stop_all(self) -> "EndpointFactory":
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -247,6 +247,23 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

+    def tenant_location_conf(
+        self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None
+    ):
+        body = location_conf.copy()
+        body["tenant_id"] = str(tenant_id)
+
+        params = {}
+        if flush_ms is not None:
+            params["flush_ms"] = str(flush_ms)
+
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
+            json=body,
+            params=params,
+        )
+        self.verbose_error(res)
+
    def tenant_delete(self, tenant_id: TenantId):
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
@@ -441,13 +458,13 @@ class PageserverHttpClient(requests.Session):
        assert res_json is None

    def timeline_get_lsn_by_timestamp(
-        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
+        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int
    ):
        log.info(
            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
        )
        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}",
        )
        self.verbose_error(res)
        res_json = res.json()
@@ -650,6 +667,14 @@ class PageserverHttpClient(requests.Session):
        res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
        self.verbose_error(res)

+    def secondary_tenant_upload(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/secondary/{tenant_id}/upload")
+        self.verbose_error(res)
+
+    def secondary_tenant_download(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/secondary/{tenant_id}/download")
+        self.verbose_error(res)
+
    def post_tracing_event(self, level: str, message: str):
        res = self.post(
            f"http://localhost:{self.port}/v1/tracing/event",
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in the wild by tests with the below contradicting logging
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
            # this seems like a mock_s3 issue
-            log.warn(
+            log.warning(
                f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
            )
            keys = 0
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in one case with mock_s3:
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
            # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-            log.warn(
+            log.warning(
                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
            )

--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -0,0 +1,133 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.types import TenantId, TimelineId
+
+
+class Workload:
+    """
+    This is not a general purpose load generator: it exists for storage tests that need to inject some
+    high level types of storage work via the postgres interface:
+    - layer writes (`write_rows`)
+    - work for compaction (`churn_rows`)
+    - reads, checking we get the right data (`validate`)
+    """
+
+    def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
+        self.env = env
+        self.tenant_id = tenant_id
+        self.timeline_id = timeline_id
+        self.table = "foo"
+
+        self.expect_rows = 0
+        self.churn_cursor = 0
+
+        self.endpoints: dict[int, Endpoint] = {}
+
+    def endpoint(self, pageserver_id: int):
+        if pageserver_id not in self.endpoints:
+            self.endpoints[pageserver_id] = self.env.endpoints.create(
+                "main",
+                tenant_id=self.tenant_id,
+                pageserver_id=pageserver_id,
+                endpoint_id=f"ep-{pageserver_id}",
+            )
+
+        endpoint = self.endpoints[pageserver_id]
+        assert not endpoint.running
+        endpoint.start(pageserver_id=pageserver_id)
+        return endpoint
+
+    def init(self, pageserver_id: int):
+        with self.endpoint(pageserver_id) as endpoint:
+            endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
+            endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
+            last_flush_lsn_upload(
+                self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+            )
+
+    def write_rows(self, n, pageserver_id):
+        with self.endpoint(pageserver_id) as endpoint:
+            start = self.expect_rows
+            end = start + n - 1
+            self.expect_rows += n
+            dummy_value = "blah"
+            endpoint.safe_psql(
+                f"""
+                INSERT INTO {self.table} (id, val)
+                SELECT g, '{dummy_value}'
+                FROM generate_series({start}, {end}) g
+                """
+            )
+
+            return last_flush_lsn_upload(
+                self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+            )
+
+    def churn_rows(self, n, pageserver_id, upload=True):
+        assert self.expect_rows >= n
+
+        max_iters = 10
+        with self.endpoint(pageserver_id) as endpoint:
+            todo = n
+            i = 0
+            while todo > 0:
+                i += 1
+                if i > max_iters:
+                    raise RuntimeError("oops")
+                start = self.churn_cursor % self.expect_rows
+                n_iter = min((self.expect_rows - start), todo)
+                todo -= n_iter
+
+                end = start + n_iter - 1
+
+                log.info(
+                    f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}"
+                )
+
+                assert end < self.expect_rows
+
+                self.churn_cursor += n_iter
+                dummy_value = "blah"
+                endpoint.safe_psql_many(
+                    [
+                        f"""
+                    INSERT INTO {self.table} (id, val)
+                    SELECT g, '{dummy_value}'
+                    FROM generate_series({start}, {end}) g
+                    ON CONFLICT (id) DO UPDATE
+                    SET val = EXCLUDED.val
+                    """,
+                        f"VACUUM {self.table}",
+                    ]
+                )
+
+            last_flush_lsn = wait_for_last_flush_lsn(
+                self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
+            )
+            ps_http = self.env.get_pageserver(pageserver_id).http_client()
+            wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
+
+        if upload:
+            # force a checkpoint to trigger upload
+            ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
+            wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
+
+    def validate(self, pageserver_id):
+        with self.endpoint(pageserver_id) as endpoint:
+            result = endpoint.safe_psql_many(
+                [
+                    "select clear_buffer_cache()",
+                    f"""
+                SELECT COUNT(*) FROM {self.table}
+                """,
+                ]
+            )
+
+            log.info(f"validate({self.expect_rows}): {result}")
+            assert result == [[("",)], [(self.expect_rows,)]]
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -4,8 +4,10 @@ First make a release build. The `-s` flag silences a lot of output, and makes it
 easier to see if you have compile errors without scrolling up.
 `BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`

+You may also need to run `./scripts/pysync`.
+
 Then run the tests
-`NEON_BIN=./target/release poetry run pytest test_runner/performance"`
+`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance`

 Some handy pytest flags for local development:
 - `-x` tells pytest to stop on first error
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -19,7 +19,7 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:

    # eviction might be the first one after an attach to access the layers
    env.pageserver.allowed_errors.append(
-        ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction"
+        ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
    )
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    return env
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):

    env.pageserver.allowed_errors.extend(
        [
-            ".*Failed to load delta layer.*",
+            ".*layer loading failed:.*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
@@ -87,7 +87,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    # Second timeline will fail during basebackup, because the local layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
-    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+    with pytest.raises(Exception, match="layer loading failed:") as err:
        pg2.start()
    log.info(
        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -247,34 +247,34 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
    ps_http.evict_all_layers(tenant_id, timeline_id)

    def ensure_resident_and_remote_size_metrics():
-        log.info("ensure that all the layers are gone")
        resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
        # we have disabled all background loops, so, this should hold
-        assert len(resident_layers) == 0
+        assert len(resident_layers) == 0, "ensure that all the layers are gone"

        info = ps_http.layer_map_info(tenant_id, timeline_id)
        log.info("layer map dump: %s", info)

-        log.info("ensure that resident_physical_size metric is zero")
        resident_physical_size_metric = ps_http.get_timeline_metric(
            tenant_id, timeline_id, "pageserver_resident_physical_size"
        )
-        assert resident_physical_size_metric == 0
-        log.info("ensure that resident_physical_size metric corresponds to layer map dump")
+        assert (
+            resident_physical_size_metric == 0
+        ), "ensure that resident_physical_size metric is zero"
        assert resident_physical_size_metric == sum(
-            [layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
-        )
+            layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
+        ), "ensure that resident_physical_size metric corresponds to layer map dump"

-        log.info("ensure that remote_physical_size metric matches layer map")
        remote_physical_size_metric = ps_http.get_timeline_metric(
            tenant_id, timeline_id, "pageserver_remote_physical_size"
        )
-        log.info("ensure that remote_physical_size metric corresponds to layer map dump")
        assert remote_physical_size_metric == sum(
            layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
-        )
+        ), "ensure that remote_physical_size metric corresponds to layer map dump"

    log.info("before runnning GC, ensure that remote_physical size is zero")
+    # leaving index_part.json upload from successful compaction out will show
+    # up here as a mismatch between remove_physical_size and summed up layermap
+    # size
    ensure_resident_and_remote_size_metrics()

    log.info("run GC")
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -8,6 +8,71 @@ from fixtures.types import Lsn
 from fixtures.utils import query_scalar


+#
+# Test pageserver get_lsn_by_timestamp API
+#
+def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
+    endpoint_main = env.endpoints.create_start("test_lsn_mapping")
+    log.info("postgres is running on 'test_lsn_mapping' branch")
+
+    cur = endpoint_main.connect().cursor()
+    # Create table, and insert rows, each in a separate transaction
+    # Disable synchronous_commit to make this initialization go faster.
+    #
+    # Each row contains current insert LSN and the current timestamp, when
+    # the row was inserted.
+    cur.execute("SET synchronous_commit=off")
+    cur.execute("CREATE TABLE foo (x integer)")
+    tbl = []
+    for i in range(1000):
+        cur.execute("INSERT INTO foo VALUES(%s)", (i,))
+        # Get the timestamp at UTC
+        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
+        tbl.append([i, after_timestamp])
+
+    # Execute one more transaction with synchronous_commit enabled, to flush
+    # all the previous transactions
+    cur.execute("SET synchronous_commit=on")
+    cur.execute("INSERT INTO foo VALUES (-1)")
+
+    # Wait until WAL is received by pageserver
+    wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+
+    with env.pageserver.http_client() as client:
+        # Check edge cases: timestamp in the future
+        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
+        result = client.timeline_get_lsn_by_timestamp(
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
+        )
+        assert result == "future"
+
+        # timestamp too the far history
+        probe_timestamp = tbl[0][1] - timedelta(hours=10)
+        result = client.timeline_get_lsn_by_timestamp(
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
+        )
+        assert result == "past"
+
+        # Probe a bunch of timestamps in the valid range
+        for i in range(1, len(tbl), 100):
+            probe_timestamp = tbl[i][1]
+            lsn = client.timeline_get_lsn_by_timestamp(
+                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
+            )
+            # Call get_lsn_by_timestamp to get the LSN
+            # Launch a new read-only node at that LSN, and check that only the rows
+            # that were supposed to be committed at that point in time are visible.
+            endpoint_here = env.endpoints.create_start(
+                branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
+            )
+            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
+
+            endpoint_here.stop_and_destroy()
+
+
 #
 # Test pageserver get_lsn_by_timestamp API
 #
@@ -45,23 +110,24 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
        # Check edge cases: timestamp in the future
        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
-        assert result == "future"
+        assert result["kind"] == "future"

        # timestamp too the far history
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
-        assert result == "past"
+        assert result["kind"] == "past"

        # Probe a bunch of timestamps in the valid range
        for i in range(1, len(tbl), 100):
            probe_timestamp = tbl[i][1]
-            lsn = client.timeline_get_lsn_by_timestamp(
-                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            result = client.timeline_get_lsn_by_timestamp(
+                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
            )
+            lsn = result["lsn"]
            # Call get_lsn_by_timestamp to get the LSN
            # Launch a new read-only node at that LSN, and check that only the rows
            # that were supposed to be committed at that point in time are visible.
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -12,13 +12,12 @@ from fixtures.neon_fixtures import (
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
    assert_tenant_state,
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_for_upload_queue_empty,
-    wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
 from fixtures.types import Lsn
@@ -384,7 +383,7 @@ def test_download_remote_layers_api(
    env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
    env.pageserver.allowed_errors.extend(
        [
-            f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
+            ".*download failed: downloading evicted layer file failed.*",
            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
        ]
    )
@@ -637,56 +636,5 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
    assert dict(kinds_after) == {"Delta": 4, "Image": 1}


-def test_ondemand_download_failure_to_replace(neon_env_builder: NeonEnvBuilder):
-    """
-    Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
-
-    See: https://github.com/neondatabase/neon/issues/3533
-    """
-
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
-    # disable gc and compaction via default tenant config because config is lost while detaching
-    # so that compaction will not be the one to download the layer but the http handler is
-    neon_env_builder.pageserver_config_override = (
-        """tenant_config={gc_period = "0s", compaction_period = "0s"}"""
-    )
-
-    env = neon_env_builder.init_start()
-
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    pageserver_http = env.pageserver.http_client()
-
-    # remove layers so that they will be redownloaded
-    pageserver_http.tenant_detach(tenant_id)
-    pageserver_http.tenant_attach(tenant_id)
-
-    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
-    pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
-
-    # requesting details with non-incremental size should trigger a download of the only layer
-    # this will need to be adjusted if an index for logical sizes is ever implemented
-    with pytest.raises(PageserverApiException):
-        # PageserverApiException is expected because of the failpoint (timeline_detail building does something)
-        # ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
-        # but should it be added back, we would wait for 15s here.
-        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
-
-    actual_message = ".* ERROR .*layermap-replace-notfound"
-    assert env.pageserver.log_contains(actual_message) is not None
-    env.pageserver.allowed_errors.append(actual_message)
-
-    env.pageserver.allowed_errors.append(
-        ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
-    )
-    # this might get to run and attempt on-demand, but not always
-    env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
-
-    # if the above returned, then we didn't have a livelock, and all is well
-
-
 def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
    return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -24,12 +24,19 @@ from fixtures.neon_fixtures import (
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.utils import list_prefix
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import (
+    assert_tenant_state,
+    list_prefix,
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
 from fixtures.remote_storage import (
    RemoteStorageKind,
 )
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import print_gc_result, wait_until
+from fixtures.workload import Workload

 # A tenant configuration that is convenient for generating uploads and deletions
 # without a large amount of postgres traffic.
@@ -486,16 +493,20 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):


 def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    initial_local_layers = sorted(
-        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
-    )
    client = env.pageserver.http_client()
-    for layer in initial_local_layers:
-        if "ephemeral" in layer.name or "temp" in layer.name:
+
+    layer_map = client.layer_map_info(tenant_id, timeline_id)
+
+    for layer in layer_map.historic_layers:
+        if layer.remote:
+            log.info(
+                f"Skipping trying to evict remote layer {tenant_id}/{timeline_id} {layer.layer_file_name}"
+            )
            continue
-        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
-        client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
+        log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.layer_file_name}")
+        client.evict_layer(
+            tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.layer_file_name
+        )


 def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
@@ -528,3 +539,91 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
    read_all(env, tenant_id, timeline_id)
    evict_all_layers(env, tenant_id, timeline_id)
    read_all(env, tenant_id, timeline_id)
+
+
+def test_multi_attach(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 3
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    pageservers = env.pageservers
+    http_clients = list([p.http_client() for p in pageservers])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # We will intentionally create situations where stale deletions happen from non-latest-generation
+    # nodes when the tenant is multiply-attached
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+        )
+
+    # Initially, the tenant will be attached to the pageserver a (first is default in our test harness)
+    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
+    _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
+    with pytest.raises(PageserverApiException):
+        http_clients[1].timeline_detail(tenant_id, timeline_id)
+    with pytest.raises(PageserverApiException):
+        http_clients[2].timeline_detail(tenant_id, timeline_id)
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(pageservers[0].id)
+    workload.write_rows(1000, pageservers[0].id)
+
+    # Attach the tenant to the other two pageservers
+    pageservers[1].tenant_attach(env.initial_tenant)
+    pageservers[2].tenant_attach(env.initial_tenant)
+
+    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
+    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
+
+    # Now they all have it attached
+    _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
+    _detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
+    _detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
+
+    # The endpoint can use any pageserver to service its reads
+    for pageserver in pageservers:
+        workload.validate(pageserver.id)
+
+    # If we write some more data, all the nodes can see it, including stale ones
+    wrote_lsn = workload.write_rows(1000, pageservers[0].id)
+    for ps_http in http_clients:
+        wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn)
+
+    # ...and indeed endpoints can see it via any of the pageservers
+    for pageserver in pageservers:
+        workload.validate(pageserver.id)
+
+    # Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage
+    for ps_http in http_clients:
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn)
+
+    # Now, the contents of remote storage will be a set of layers from each pageserver, but with unique
+    # generation numbers
+    # TODO: validate remote storage contents
+
+    # Stop all pageservers
+    for ps in pageservers:
+        ps.stop()
+
+    # Returning to a normal healthy state: all pageservers will start, but only the one most
+    # recently attached via the control plane will re-attach on startup
+    for ps in pageservers:
+        ps.start()
+
+    with pytest.raises(PageserverApiException):
+        _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
+    with pytest.raises(PageserverApiException):
+        _detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
+    _detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
+
+    # All data we wrote while multi-attached remains readable
+    workload.validate(pageservers[2].id)
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -0,0 +1,468 @@
+import random
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
+from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
+from fixtures.remote_storage import RemoteStorageKind
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import wait_until
+from fixtures.workload import Workload
+
+# A tenant configuration that is convenient for generating uploads and deletions
+# without a large amount of postgres traffic.
+TENANT_CONF = {
+    # small checkpointing and compaction targets to ensure we generate many upload operations
+    "checkpoint_distance": f"{128 * 1024}",
+    "compaction_target_size": f"{128 * 1024}",
+    "compaction_threshold": "1",
+    # no PITR horizon, we specify the horizon when we request on-demand GC
+    "pitr_interval": "0s",
+    # disable background compaction and GC. We invoke it manually when we want it to happen.
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    # create image layers eagerly, so that GC can remove some layers
+    "image_creation_threshold": "1",
+}
+
+
+def evict_random_layers(
+    rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
+):
+    """
+    Evict 50% of the layers on a pageserver
+    """
+    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
+    initial_local_layers = sorted(
+        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
+    )
+    client = pageserver.http_client()
+    for layer in initial_local_layers:
+        if "ephemeral" in layer.name:
+            continue
+
+        if rng.choice([True, False]):
+            log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
+            client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
+
+
+@pytest.mark.parametrize("seed", [1, 2, 3])
+def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
+    """
+    Issue many location configuration changes, ensure that tenants
+    remain readable & we don't get any unexpected errors.  We should
+    have no ERROR in the log, and no 500s in the API.
+
+    The location_config API is intentionally designed so that all destination
+    states are valid, so that we may test it in this way: the API should always
+    work as long as the tenant exists.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 3
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    pageservers = env.pageservers
+    list([p.http_client() for p in pageservers])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # We will make no effort to avoid stale attachments
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+        )
+
+        # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
+        message = ".*duplicated L1 layer layer=.*"
+        ps.allowed_errors.append(message)
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, env.pageservers[0].id)
+
+    # We use a fixed seed to make the test reproducible: we want a randomly
+    # chosen order, but not to change the order every time we run the test.
+    rng = random.Random(seed)
+
+    initial_generation = 1
+    last_state = {
+        env.pageservers[0].id: ("AttachedSingle", initial_generation),
+        env.pageservers[1].id: ("Detached", None),
+        env.pageservers[2].id: ("Detached", None),
+    }
+
+    latest_attached = env.pageservers[0].id
+
+    for _i in range(0, 64):
+        # Pick a pageserver
+        pageserver = rng.choice(env.pageservers)
+
+        # Pick a pseudorandom state
+        modes = [
+            "AttachedSingle",
+            "AttachedMulti",
+            "AttachedStale",
+            "Secondary",
+            "Detached",
+            "_Evictions",
+            "_Restart",
+        ]
+
+        mode = rng.choice(modes)
+
+        last_state_ps = last_state[pageserver.id]
+        if mode == "_Evictions":
+            if last_state_ps[0].startswith("Attached"):
+                log.info(f"Action: evictions on pageserver {pageserver.id}")
+                evict_random_layers(rng, pageserver, tenant_id, timeline_id)
+            else:
+                log.info(
+                    f"Action: skipping evictions on pageserver {pageserver.id}, is not attached"
+                )
+        elif mode == "_Restart":
+            log.info(f"Action: restarting pageserver {pageserver.id}")
+            pageserver.stop()
+            pageserver.start()
+            if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
+                log.info("Entering postgres...")
+                workload.churn_rows(rng.randint(128, 256), pageserver.id)
+                workload.validate(pageserver.id)
+            elif last_state_ps[0].startswith("Attached"):
+                # The `attachment_service` will only re-attach on startup when a pageserver was the
+                # holder of the latest generation: otherwise the pageserver will revert to detached
+                # state if it was running attached with a stale generation
+                last_state[pageserver.id] = ("Detached", None)
+        else:
+            secondary_conf: Optional[Dict[str, Any]] = None
+            if mode == "Secondary":
+                secondary_conf = {"warm": rng.choice([True, False])}
+
+            location_conf: Dict[str, Any] = {
+                "mode": mode,
+                "secondary_conf": secondary_conf,
+                "tenant_conf": {},
+            }
+
+            log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}")
+
+            # Select a generation number
+            if mode.startswith("Attached"):
+                if last_state_ps[1] is not None:
+                    if rng.choice([True, False]):
+                        # Move between attached states, staying in the same generation
+                        generation = last_state_ps[1]
+                    else:
+                        # Switch generations, while also jumping between attached states
+                        generation = env.attachment_service.attach_hook(tenant_id, pageserver.id)
+                        latest_attached = pageserver.id
+                else:
+                    generation = env.attachment_service.attach_hook(tenant_id, pageserver.id)
+                    latest_attached = pageserver.id
+            else:
+                generation = None
+
+            location_conf["generation"] = generation
+
+            pageserver.tenant_location_configure(tenant_id, location_conf)
+            last_state[pageserver.id] = (mode, generation)
+
+            if mode.startswith("Attached"):
+                # TODO: a variant of this test that runs background endpoint workloads, as well as
+                # the inter-step workloads.
+
+                workload.churn_rows(
+                    rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale"
+                )
+                workload.validate(pageserver.id)
+
+    # Attach all pageservers
+    for ps in env.pageservers:
+        location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
+        ps.tenant_location_configure(tenant_id, location_conf)
+
+    # Confirm that all are readable
+    for ps in env.pageservers:
+        workload.validate(ps.id)
+
+    # Detach all pageservers
+    for ps in env.pageservers:
+        location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}}
+        ps.tenant_location_configure(tenant_id, location_conf)
+
+    # Confirm that all local disk state was removed on detach
+    # TODO
+
+
+def test_live_migration(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the sequence of location states that are used in a live migration.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    pageserver_a = env.pageservers[0]
+    pageserver_b = env.pageservers[1]
+
+    initial_generation = 1
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, env.pageservers[0].id)
+
+    # Make the destination a secondary location
+    pageserver_b.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+
+    workload.churn_rows(64, pageserver_a.id, upload=False)
+
+    # Set origin attachment to stale
+    log.info("Setting origin to AttachedStale")
+    pageserver_a.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "AttachedStale",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": initial_generation,
+        },
+        flush_ms=5000,
+    )
+
+    migrated_generation = env.attachment_service.attach_hook(tenant_id, pageserver_b.id)
+    log.info(f"Acquired generation {migrated_generation} for destination pageserver")
+    assert migrated_generation == initial_generation + 1
+
+    # Writes and reads still work in AttachedStale.
+    workload.validate(pageserver_a.id)
+    workload.validate(pageserver_a.id)
+
+    # Ensure that secondary location's timeline directory is populated: we will then
+    # do some more writes on top of that to ensure that the newly attached pageserver
+    # properly makes use of the downloaded layers as well as ingesting WAL to catch up.
+    pageserver_a.http_client().secondary_tenant_upload(tenant_id)
+    pageserver_b.http_client().secondary_tenant_download(tenant_id)
+
+    # Generate some more dirty writes
+    workload.churn_rows(64, pageserver_a.id)
+
+    # Attach the destination
+    log.info("Setting destination to AttachedMulti")
+    pageserver_b.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "AttachedMulti",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": migrated_generation,
+        },
+    )
+
+    # Wait for destination LSN to catch up with origin
+    origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[
+        "last_record_lsn"
+    ]
+
+    def caught_up():
+        destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[
+            "last_record_lsn"
+        ]
+        log.info(
+            f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}"
+        )
+        assert destination_lsn >= origin_lsn
+
+    wait_until(100, 0.1, caught_up)
+
+    # The destination should accept writes
+    workload.churn_rows(64, pageserver_b.id)
+
+    # Dual attached: both are readable.
+    workload.validate(pageserver_a.id)
+    workload.validate(pageserver_b.id)
+
+    # Revert the origin to secondary
+    log.info("Setting origin to Secondary")
+    pageserver_a.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+
+    workload.churn_rows(64, pageserver_b.id)
+
+    # Put the destination into final state
+    pageserver_b.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "AttachedSingle",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": migrated_generation,
+        },
+    )
+
+    workload.churn_rows(64, pageserver_b.id)
+    workload.validate(pageserver_b.id)
+
+
+def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    """
+    Inspect local storage on a pageserver to discover which layer files are present.
+
+    :return: list of relative paths to layers, from the timeline root.
+    """
+    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
+
+    def relative(p: Path) -> Path:
+        return p.relative_to(timeline_path)
+
+    return sorted(
+        list(
+            map(
+                relative,
+                filter(
+                    lambda path: path.name != "metadata"
+                    and "ephemeral" not in path.name
+                    and "temp" not in path.name,
+                    timeline_path.glob("*"),
+                ),
+            )
+        )
+    )
+
+
+def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the overall data flow in secondary mode:
+     - Heatmap uploads from the attached location
+     - Heatmap & layer downloads from the secondary location
+     - Eviction of layers on the attached location results in deletion
+       on the secondary location as well.
+    """
+    neon_env_builder.enable_generations = True
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps_attached = env.pageservers[0]
+    ps_secondary = env.pageservers[1]
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, ps_attached.id)
+
+    # Configure a secondary location
+    log.info("Setting up secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+
+    # Explicit upload/download cycle
+    # ==============================
+    log.info("Synchronizing after initial write...")
+    ps_attached.http_client().secondary_tenant_upload(tenant_id)
+
+    ps_secondary.http_client().secondary_tenant_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Make changes on attached pageserver, check secondary downloads them
+    # ===================================================================
+    log.info("Synchronizing after subsequent write...")
+    workload.churn_rows(128, ps_attached.id)
+    ps_attached.http_client().secondary_tenant_upload(tenant_id)
+    ps_secondary.http_client().secondary_tenant_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Do evictions on attached pageserver, check secondary follows along
+    # ==================================================================
+    log.info("Evicting a layer...")
+    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
+    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
+
+    log.info("Synchronizing after eviction...")
+    ps_attached.http_client().secondary_tenant_upload(tenant_id)
+    ps_secondary.http_client().secondary_tenant_download(tenant_id)
+
+    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Scrub the remote storage
+    # ========================
+    # This confirms that the scrubber isn't upset by the presence of the heatmap
+    # TODO: depends on `jcsp/scrubber-index-part` branch.
+
+    # Detach secondary and delete tenant
+    # ===================================
+    # This confirms that the heatmap gets cleaned up as well as other normal content.
+    log.info("Detaching secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+        },
+    )
+
+    log.info("Deleting tenant...")
+    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
+
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
+
+
+# def test_secondary_download_loop
+# Configure some short check intervals, and validate that layers are downloaded by secondary
+# without any explicit admin API calls.
+
+# def test_secondary_eviction(neon_env_builder: NeonEnvBuilder):
+#
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -1,5 +1,6 @@
 import json
 import subprocess
+import time
 from typing import Any, List, Optional, Tuple

 import psycopg2
@@ -364,10 +365,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):

    pid1 = get_pid(200, "http")["rows"][0]["pid"]

+    time.sleep(0.02)
+
    # query should be on the same connection
    rows = get_pid(200, "http")["rows"]
    assert rows == [{"pid": pid1}]

+    time.sleep(0.02)
+
    # incorrect password should not work
    res = get_pid(400, "foobar")
    assert "password authentication failed for user" in res["message"]
@@ -378,10 +383,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
    assert pid1 != pid2

+    time.sleep(0.02)
+
    # query should be on an existing connection
    pid = get_pid(200, "http2")["rows"][0]["pid"]
    assert pid in [pid1, pid2]

+    time.sleep(0.02)
+
    # old password should not work
    res = get_pid(400, "http")
    assert "password authentication failed for user" in res["message"]
@@ -419,6 +428,7 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
        )

    pid1 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
+    time.sleep(0.02)
    query(200, "BEGIN")
    pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
    assert pid1 != pid2
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -586,7 +586,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
    log.info("sending delete request")
    checkpoint_allowed_to_fail.set()
    env.pageserver.allowed_errors.append(
-        ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
+        ".* ERROR .*Error processing HTTP request: InternalServerError\\(The timeline or pageserver is shutting down"
        ".* ERROR .*[Cc]ould not flush frozen layer.*"
    )

@@ -757,12 +757,14 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
        create_thread.join()


-# Regression test for a race condition where L0 layers are compacted before the upload,
-# resulting in the uploading complaining about the file not being found
-# https://github.com/neondatabase/neon/issues/4526
-def test_compaction_delete_before_upload(
+def test_compaction_waits_for_upload(
    neon_env_builder: NeonEnvBuilder,
 ):
+    """
+    Compaction waits for outstanding uploads to complete, so that it avoids deleting layers
+    files that have not yet been uploaded.  This test forces a race between upload and
+    compaction.
+    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    env = neon_env_builder.init_start(
@@ -792,50 +794,81 @@ def test_compaction_delete_before_upload(
        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

        # Now make the flushing hang and update one small piece of data
-        client.configure_failpoints(("flush-frozen-pausable", "pause"))
+        client.configure_failpoints(("before-upload-layer-pausable", "pause"))

        endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")

        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

-    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
-    barrier = threading.Barrier(2)
+    checkpoint_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    compact_result: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    compact_barrier = threading.Barrier(2)

    def checkpoint_in_background():
-        barrier.wait()
        try:
+            log.info("Checkpoint starting")
            client.timeline_checkpoint(tenant_id, timeline_id)
-            q.put(None)
+            log.info("Checkpoint complete")
+            checkpoint_result.put(None)
        except PageserverApiException as e:
-            q.put(e)
+            log.info("Checkpoint errored: {e}")
+            checkpoint_result.put(e)

-    create_thread = threading.Thread(target=checkpoint_in_background)
-    create_thread.start()
+    def compact_in_background():
+        compact_barrier.wait()
+        try:
+            log.info("Compaction starting")
+            client.timeline_compact(tenant_id, timeline_id)
+            log.info("Compaction complete")
+            compact_result.put(None)
+        except PageserverApiException as e:
+            log.info("Compaction errored: {e}")
+            compact_result.put(e)
+
+    checkpoint_thread = threading.Thread(target=checkpoint_in_background)
+    checkpoint_thread.start()
+
+    compact_thread = threading.Thread(target=compact_in_background)
+    compact_thread.start()

    try:
-        barrier.wait()
+        # Start the checkpoint, see that it blocks
+        log.info("Waiting to see checkpoint hang...")
+        time.sleep(5)
+        assert checkpoint_result.empty()

-        time.sleep(4)
-        client.timeline_compact(tenant_id, timeline_id)
+        # Start the compaction, see that it finds work to do but blocks
+        compact_barrier.wait()
+        log.info("Waiting to see compaction hang...")
+        time.sleep(5)
+        assert compact_result.empty()

-        client.configure_failpoints(("flush-frozen-pausable", "off"))
+        # This is logged once compaction is started, but before we wait for operations to complete
+        assert env.pageserver.log_contains("compact_level0_phase1 stats available.")

-        conflict = q.get()
+        # Once we unblock uploads the compaction should complete successfully
+        log.info("Disabling failpoint")
+        client.configure_failpoints(("before-upload-layer-pausable", "off"))
+        log.info("Awaiting compaction result")
+        assert compact_result.get(timeout=10) is None
+        log.info("Awaiting checkpoint result")
+        assert checkpoint_result.get(timeout=10) is None

-        assert conflict is None
+    except Exception:
+        # Log the actual failure's backtrace here, before we proceed to join threads
+        log.exception("Failure, cleaning up...")
+        raise
    finally:
-        create_thread.join()
+        compact_barrier.abort()

-    # Add a delay for the uploads to run into either the file not found or the
-    time.sleep(4)
+        checkpoint_thread.join()
+        compact_thread.join()

    # Ensure that this actually terminates
    wait_upload_queue_empty(client, tenant_id, timeline_id)

-    # For now we are hitting this message.
-    # Maybe in the future the underlying race condition will be fixed,
-    # but until then, ensure that this message is hit instead.
-    assert env.pageserver.log_contains(
+    # We should not have hit the error handling path in uploads where the remote file is gone
+    assert not env.pageserver.log_contains(
        "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
    )

--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -227,9 +227,7 @@ def test_tenant_redownloads_truncated_file_on_startup(

    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

-    env.pageserver.allowed_errors.append(
-        ".*removing local file .* because it has unexpected length.*"
-    )
+    env.pageserver.allowed_errors.append(".*removing local file .* because .*")

    # FIXME: Are these expected?
    env.pageserver.allowed_errors.append(
Author	SHA1	Message	Date
John Spray	0baf91fcac	pageserver: include secondary tenants in disk usage eviction	2023-10-26 20:33:47 +01:00
John Spray	cd27f42839	pageserver: pass TenantManager into disk usage eviction task	2023-10-26 20:33:47 +01:00
John Spray	edf303ff44	pageserver: add Layer::for_secondary	2023-10-26 20:33:47 +01:00
John Spray	ec189338eb	tests: avoid deprecated log.warn()	2023-10-26 20:33:47 +01:00
John Spray	e22f9b9fbb	tests: more logging from attachment_service	2023-10-26 20:33:47 +01:00
John Spray	83fc486636	tests: log in UTC	2023-10-26 20:33:47 +01:00
John Spray	3afec731dc	pageserver: add a CancellationToken to Tenant	2023-10-26 20:33:47 +01:00
John Spray	559611f9a6	pageserver: add Tenant::heatmap_hook	2023-10-26 20:33:47 +01:00
John Spray	d56dad3824	tests: add test_pageserver_secondary	2023-10-26 20:33:47 +01:00
John Spray	547acb6653	pageserver: create timelines/ dir when configuring secondary location	2023-10-26 20:33:47 +01:00
John Spray	5264ffc2a9	pageserver/http: add testing routes for secondary mode	2023-10-26 20:33:47 +01:00
John Spray	ee841708ff	pageserver: implement flush on location conf update	2023-10-26 20:33:47 +01:00
John Spray	f356df1860	pageserver: more logging during tenant shutdown	2023-10-26 20:33:47 +01:00
John Spray	fbe1da2981	tests: add helpers for location_conf	2023-10-26 20:33:47 +01:00
John Spray	22c94d99f5	tests: refactor a path helper	2023-10-26 20:33:47 +01:00
John Spray	6de414b5a5	tests: add multi attach test	2023-10-26 20:33:47 +01:00
John Spray	8c0ce9723a	Refactor Workload into shared location	2023-10-26 20:33:47 +01:00
John Spray	5a4c371e94	pageserver: launch tasks for secondary mode	2023-10-26 20:33:47 +01:00
John Spray	65096ac992	pageserver: add secondary downloader & heatmaps	2023-10-26 20:33:47 +01:00
John Spray	049cb1fb4b	Add Timeline::generate_heatmap, remote client heatmap upload	2023-10-26 20:33:47 +01:00
John Spray	640350a2c0	TenantManager: implement hooks for secondary downloads	2023-10-26 20:33:47 +01:00
John Spray	d422105d88	pageserver: start refactoring into TenantManager	2023-10-26 11:14:24 +01:00
John Spray	ead1931167	pageserver: add InProgress top level state & make TenantsMap lock synchronous	2023-10-26 11:13:16 +01:00
John Spray	e0ebdfc7ce	pageserver: suppress compaction/gc errors while stopping (#5670 ) ## Problem Tenant deletions would sometimes be accompanied by compaction stack traces, because `shutdown()` puts the tenant into stopping state before it joins background tasks. ## Summary of changes Treat GC+Compaction as no-ops on a Stopping tenant.	2023-10-26 10:59:24 +01:00
Joonas Koivunen	c508d3b5fa	reimpl Layer, remove remote layer, trait Layer, trait PersistentLayer (#4938 ) Implement a new `struct Layer` abstraction which manages downloadness internally, requiring no LayerMap locking or rewriting to download or evict providing a property "you have a layer, you can read it". The new `struct Layer` provides ability to keep the file resident via a RAII structure for new layers which still need to be uploaded. Previous solution solved this `RemoteTimelineClient::wait_completion` which lead to bugs like #5639. Evicting or the final local deletion after garbage collection is done using Arc'd value `Drop`. With a single `struct Layer` the closed open ended `trait Layer`, `trait PersistentLayer` and `struct RemoteLayer` are removed following noting that compaction could be simplified by simply not using any of the traits in between: #4839. The new `struct Layer` is a preliminary to remove `Timeline::layer_removal_cs` documented in #4745. Preliminaries: #4936, #4937, #5013, #5014, #5022, #5033, #5044, #5058, #5059, #5061, #5074, #5103, epic #5172, #5645, #5649. Related split off: #5057, #5134.	2023-10-26 12:36:38 +03:00
John Spray	acda65d7d4	pageserver: quieten "Failed to get info about AUX files" (#5669 ) ## Problem This line caused lots of errors to be emitted for healthy tenants. ## Summary of changes Downgrade to debug, since it is an expected code path we'll take for tenants at startup.	2023-10-26 09:53:18 +01:00
dependabot[bot]	378daa358b	build(deps): bump werkzeug from 2.2.3 to 3.0.1 (#5665 )	2023-10-25 22:50:35 +00:00
Alexander Bayandin	85f4514e7d	Get env var for real Azure tests from GitHub (#5662 ) ## Problem We'll need to switch `REMOTE_STORAGE_AZURE_REGION` from the current `eastus2` region to something `eu-central-1`-like. This may require changing `AZURE_STORAGE_ACCESS_KEY`. To make it possible to switch from one place (not to break a lot of builds on CI), move `REMOTE_STORAGE_AZURE_CONTAINER` and `REMOTE_STORAGE_AZURE_REGION` to GitHub Variables. See https://github.com/neondatabase/neon/settings/variables/actions ## Summary of changes - Get values for `REMOTE_STORAGE_AZURE_CONTAINER` & `REMOTE_STORAGE_AZURE_REGION` from GitHub Variables	2023-10-25 22:54:23 +01:00
Joonas Koivunen	f70019797c	refactor(rtc): schedule compaction update (#5649 ) a single operation instead of N uploads and 1 deletion scheduling with write(layer_map) lock releasing in the between. Compaction update will make for a much better place to change how the operation will change in future compared to more general file based operations. builds upon #5645. solves the problem of difficult to see hopeful correctness w.r.t. other `index_part.json` changing operations. Co-authored-by: Shany Pozin <shany@neon.tech>	2023-10-25 22:25:43 +01:00
Joonas Koivunen	325258413a	fix: trampling on global physical size metric (#5663 ) All loading (attached, or from disk) timelines overwrite the global gauge for physical size. The `_set` method cannot be used safely, so remove it and just "add" the physical size.	2023-10-25 19:29:12 +01:00
Konstantin Knizhnik	4ddbc0e46d	Ignore missed AUX_FILES_KEY when generating image layer (#5660 ) ## Problem Logical replication requires new AUX_FILES_KEY which is definitely absent in existed database. We do not have function to check if key exists in our KV storage. So I have to handle the error in `list_aux_files` method. But this key is also included in key space range and accessed y `create_image_layer` method. ## Summary of changes Check if AUX_FILES_KEY exists before including it in keyspace. --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Shany Pozin <shany@neon.tech> Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>	2023-10-25 18:35:23 +01:00
Arpad Müller	a673e4e7a9	Optionally return json from get_lsn_by_timestamp (#5608 ) This does two things: first a minor refactor to not use HTTP/1.x style header names and also to not panic if some certain requests had no "Accept" header. As a second thing, it addresses the third bullet point from #3689: > Change `get_lsn_by_timestamp` API method to return LSN even if we only found commit before the specified timestamp. This is done by adding a version parameter to the `get_lsn_by_timestamp` API call and making its behaviour depend on the version number. Part of #3414 (but doesn't address it in its entirety). --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-10-25 18:46:34 +02:00
bojanserafimov	c155cc0c3f	Fix test instructions readme (#5644 )	2023-10-25 11:53:04 -04:00
Conrad Ludgate	32126d705b	proxy refactor serverless (#4685 ) ## Problem Our serverless backend was a bit jumbled. As a comment indicated, we were handling SQL-over-HTTP in our `websocket.rs` file. I've extracted out the `sql_over_http` and `websocket` files from the `http` module and put them into a new module called `serverless`. ## Summary of changes ```sh mkdir proxy/src/serverless mv proxy/src/http/{conn_pool,sql_over_http,websocket}.rs proxy/src/serverless/ mv proxy/src/http/server.rs proxy/src/http/health_server.rs mv proxy/src/metrics proxy/src/usage_metrics.rs ``` I have also extracted the hyper server and handler from websocket.rs into `serverless.rs`	2023-10-25 15:43:03 +01:00
John Spray	5683ae9eab	pageserver: suppress some of the most common spurious warnings (#5658 ) Two of the most common spurious log messages: - broker connections terminate & we log at error severity. Unfortunately tonic gives us an "Unknown" error so to suppress these we're doing string matching. It's hacky but worthwhile for operations. - the first iteration of tenant background tasks tends to over-run its schedule and emit a warning. Ultimately we should fix these to run on time, but for now we are not benefiting from polluting our logs with the warnings.	2023-10-25 14:55:37 +01:00
Alexander Bayandin	4778b6a12e	Switch to querying new tests results DB (#5616 ) ## Problem We started to store test results in a new format in https://github.com/neondatabase/neon/pull/4549. This PR switches scripts to query this db. (we can completely remove old DB/ingestions scripts in a couple of weeks after the PR merged) ## Summary of changes - `scripts/benchmark_durations.py` query new database - `scripts/flaky_tests.py` query new database	2023-10-25 14:25:13 +01:00
John Spray	8b8be7bed4	tests: don't fail tests on torn log lines (#5655 ) ## Problem Tests that force-kill and restart a service can generate torn log lines that might match WARN\|ERROR, but not match the allow expression that a test has loaded, e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5651/6638398772/index.html#suites/7538959189f4501983ddd9e167836c8b/d272ba8a73e6945c ## Summary of changes Ignore log lines which match a regex for torn log lines on restart: they have two timestamps and the second line is an "INFO version"... message.	2023-10-25 13:29:30 +01:00
Conrad Ludgate	a461c459d8	fix http pool test (#5653 ) ## Problem We defer the returning of connections the the connection pool. It's possible for our test to be faster than the returning of connections - which then gets a differing process ID because it opens a new connection. ## Summary of changes 1. Delay the tests just a little (20ms) to give more chance for connections to return. 2. Correlate connection IDs with the connection logs a bit more	2023-10-25 13:20:45 +01:00
Joonas Koivunen	4ae2d1390d	refactor(remote_timeline_client): Split deletion into unlinking + deletion (#5645 ) Quest: #4745. Prerequisite for #4938. Original https://github.com/neondatabase/neon/pull/4938#issuecomment-1777150665. The new Layer implementation has so far been using `RemoteTimelineClient::schedule_layer_file_deletion` from `Layer::drop` but it was noticed that this could mean that the L0s compaction wanted to remove could linger in the index part for longer time or be left there for longer time. Solution is to split the `RemoteTimelineClient::schedule_layer_file_deletion` into two parts: - unlinking from index_part.json, to be called from end of compaction and gc - scheduling of actual deletions, to be called from `Layer::drop` The added methods are added unused.	2023-10-25 15:01:19 +03:00
Joonas Koivunen	c5949e1fd6	misc smaller improvements (#5527 ) - finally add an `#[instrument]` to Timeline::create_image_layers, making it easier to see that something is happening because we create image layers - format some macro context code - add a warning not to create new validation functions a la parse do not validate Split off from #5198.	2023-10-25 14:59:43 +03:00
John Spray	127837abb0	tests: de-flake test_eviction_across_generations (#5650 ) ## Problem There was an edge case where initial logical size calculation can be downloading a layer that wasn't hit by the test's `SELECT`, and it's on-disk but still marked as remote in the pageserver's internal state, so evicting it fails. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5648/6630099807/index.html#categories/dee044ec96f666edb90a77c01099a941/e38e97a2735ffa8c/ ## Summary of changes Use pageserver API to learn about layers, instead of inspecting local disk, so that we will always agree with the pageserver about which layer are local.	2023-10-25 10:55:45 +01:00
Conrad Ludgate	b2c96047d0	move wake compute after the auth quirks logic (#5642 ) ## Problem https://github.com/neondatabase/neon/issues/5568#issuecomment-1777015606 ## Summary of changes Make the auth_quirks_creds return the authentication information, and push the wake_compute loop to after, inside `auth_quirks`	2023-10-25 08:30:47 +01:00
Em Sharnoff	44202eeb3b	Bump vm-builder v0.18.1 -> v0.18.2 (#5646 ) Only applicable change was neondatabase/autoscaling#571, removing the postgres_exporter flags `--auto-discover-databases` and `--exclude-databases=...`	2023-10-24 16:04:28 -07:00
Arpad Müller	4bef977c56	Use tuples instead of manual comparison chain (#5637 ) Makes code a little bit simpler	2023-10-24 17:16:23 +00:00
John Spray	a0b862a8bd	pageserver: schedule frozen layer uploads inside the layers lock (#5639 ) ## Problem Compaction's source of truth for what layers exist is the LayerManager. `flush_frozen_layer` updates LayerManager before it has scheduled upload of the frozen layer. Compaction can then "see" the new layer, decide to delete it, schedule uploads of replacement layers, all before `flush_frozen_layer` wakes up again and schedules the upload. When the upload is scheduled, the local layer file may be gone, in which case we end up with no such layer in remote storage, but an entry still added to IndexPart pointing to the missing layer. ## Summary of changes Schedule layer uploads inside the `self.layers` lock, so that whenever a frozen layer is present in LayerManager, it is also present in RemoteTimelineClient's metadata. Closes: #5635	2023-10-24 13:57:01 +01:00
Conrad Ludgate	767ef29390	proxy: filter out more quota exceeded errors (#5640 ) ## Problem Looking at logs, I saw more retries being performed for other quota exceeded errors ## Summary of changes Filter out all quota exceeded family of errors	2023-10-24 13:13:23 +01:00