Merge pull request #5778 from neondatabase/releases/2023-11-03

Release 2023-11-03
Merge branch 'release' into releases/2023-11-03
2026-05-20 22:50:38 +00:00 · 2023-11-03 16:06:58 +01:00 · 2023-11-03 11:18:15 +01:00 · 2023-11-03 08:30:58 +00:00 · 2023-11-02 19:05:33 +00:00 · 2023-11-02 16:26:15 +00:00
41 changed files with 1739 additions and 1541 deletions
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 2'
+    - cron: '0 7 * * 5'
  workflow_dispatch:

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3550,7 +3550,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3563,7 +3563,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3574,7 +3574,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3592,7 +3592,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -5396,7 +5396,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
 dependencies = [
 "async-trait",
 "byteorder",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -161,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -202,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }

 ################# Binary contents sections

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -278,32 +278,26 @@ fn main() -> Result<()> {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            use tracing::warn;
-            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let vm_monitor_addr = matches
+                .get_one::<String>("vm-monitor-addr")
+                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
-            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
-            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
-                (None, None) => None,
-                (None, Some(_)) => {
-                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
-                    None
-                }
-                (Some(_), None) => {
-                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
-                }
-                (Some(_), Some(_)) => Some(
+            let rt = if env::var_os("AUTOSCALING").is_some() {
+                Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
-                        .expect("failed to create tokio runtime for monitor"),
-                ),
+                        .expect("failed to create tokio runtime for monitor")
+                )
+            } else {
+                None
            };

            // This token is used internally by the monitor to clean up all threads
@@ -314,8 +308,7 @@ fn main() -> Result<()> {
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.cloned().unwrap(),
-                        file_cache_on_disk,
+                        addr: vm_monitor_addr.clone(),
                    })),
                    token.clone(),
                ))
@@ -487,6 +480,8 @@ fn cli() -> clap::Command {
                .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
+            // DEPRECATED, NO LONGER DOES ANYTHING.
+            // See https://github.com/neondatabase/cloud/issues/7516
            Arg::new("file-cache-on-disk")
                .long("file-cache-on-disk")
                .action(clap::ArgAction::SetTrue),
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -24,7 +24,7 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", jwt)
+        .header("Authorization", format!("Bearer {}", jwt))
        .send()
        .map_err(|e| {
            (
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -12,6 +12,7 @@ use hyper::{Body, Request, Response};
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
+use utils::http::endpoint::request_span;
 use utils::logging::{self, LogFormat};
 use utils::signals::{ShutdownSignals, Signal};

@@ -221,8 +222,25 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
            generation: 0,
        });

-    if attach_req.node_id.is_some() {
+    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
        tenant_state.generation += 1;
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            ps_id = %attaching_pageserver,
+            generation = %tenant_state.generation,
+            "issuing",
+        );
+    } else if let Some(ps_id) = tenant_state.pageserver {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            %ps_id,
+            generation = %tenant_state.generation,
+            "dropping",
+        );
+    } else {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            "no-op: tenant already has no pageserver");
    }
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;
@@ -240,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", handle_re_attach)
-        .post("/validate", handle_validate)
-        .post("/attach-hook", handle_attach_hook)
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
+        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
 }

 #[tokio::main]
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -0,0 +1,108 @@
+# Updating Postgres
+
+## Minor Versions
+
+When upgrading to a new minor version of Postgres, please follow these steps:
+
+_Example: 15.4 is the new minor version to upgrade to from 15.3._
+
+1. Clone the Neon Postgres repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/postgres.git
+    ```
+
+1. Add the Postgres upstream remote.
+
+    ```shell
+    git remote add upstream https://git.postgresql.org/git/postgresql.git
+    ```
+
+1. Create a new branch based on the stable branch you are updating.
+
+    ```shell
+    git checkout -b my-branch REL_15_STABLE_neon
+    ```
+
+1. Tag the last commit on the stable branch you are updating.
+
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.
+
+    ```shell
+    git fetch upstream REL_15_4
+    git rebase REL_15_4
+    ```
+
+1. Run the Postgres test suite to make sure our commits have not affected
+Postgres in a negative way.
+
+    ```shell
+    make check
+    # OR
+    meson test -C builddir
+    ```
+
+1. Push your branch to the Neon Postgres repository.
+
+    ```shell
+    git push origin my-branch
+    ```
+
+1. Clone the Neon repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/neon.git
+    ```
+
+1. Create a new branch.
+
+1. Change the `revisions.json` file to point at the HEAD of your Postgres
+branch.
+
+1. Update the Git submodule.
+
+    ```shell
+    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule update --remote vendor/postgres-v15
+    ```
+
+1. Run the Neon test suite to make sure that Neon is still good to go on this
+minor Postgres release.
+
+    ```shell
+    ./scripts/poetry -k pg15
+    ```
+
+1. Commit your changes.
+
+1. Create a pull request, and wait for CI to go green.
+
+1. Force push the rebased Postgres branches into the Neon Postgres repository.
+
+    ```shell
+    git push --force origin my-branch:REL_15_STABLE_neon
+    ```
+
+    It may require disabling various branch protections.
+
+1. Update your Neon PR to point at the branches.
+
+    ```shell
+    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
+    git commit --amend --no-edit
+    git push --force origin
+    ```
+
+1. Merge the pull request after getting approval(s) and CI completion.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
+    completion,
    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
@@ -77,7 +78,12 @@ pub enum TenantState {
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
-    Stopping,
+    Stopping {
+        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
+        // otherwise it will not be skipped during deserialization
+        #[serde(skip)]
+        progress: completion::Barrier,
+    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
@@ -987,7 +993,13 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
-            (line!(), TenantState::Stopping {}, "Stopping"),
+            (
+                line!(),
+                TenantState::Stopping {
+                    progress: utils::completion::Barrier::default(),
+                },
+                "Stopping",
+            ),
            (
                line!(),
                TenantState::Broken {
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,4 +1,7 @@
-use std::sync::{Arc, Mutex, MutexGuard};
+use std::sync::{
+    atomic::{AtomicUsize, Ordering},
+    Arc, Mutex, MutexGuard,
+};
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
@@ -10,6 +13,7 @@ use tokio::sync::Semaphore;
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
    inner: Mutex<Inner<T>>,
+    initializers: AtomicUsize,
 }

 impl<T> Default for OnceCell<T> {
@@ -17,6 +21,7 @@ impl<T> Default for OnceCell<T> {
    fn default() -> Self {
        Self {
            inner: Default::default(),
+            initializers: AtomicUsize::new(0),
        }
    }
 }
@@ -49,6 +54,7 @@ impl<T> OnceCell<T> {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
+            initializers: AtomicUsize::new(0),
        }
    }

@@ -60,8 +66,8 @@ impl<T> OnceCell<T> {
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
-        F: FnOnce() -> Fut,
-        Fut: std::future::Future<Output = Result<T, E>>,
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        let sem = {
            let guard = self.inner.lock().unwrap();
@@ -71,29 +77,61 @@ impl<T> OnceCell<T> {
            guard.init_semaphore.clone()
        };

-        let permit = sem.acquire_owned().await;
-        if permit.is_err() {
-            let guard = self.inner.lock().unwrap();
-            assert!(
-                guard.value.is_some(),
-                "semaphore got closed, must be initialized"
-            );
-            return Ok(Guard(guard));
-        } else {
-            // now we try
-            let value = factory().await?;
+        let permit = {
+            // increment the count for the duration of queued
+            let _guard = CountWaitingInitializers::start(self);
+            sem.acquire_owned().await
+        };

-            let mut guard = self.inner.lock().unwrap();
-            assert!(
-                guard.value.is_none(),
-                "we won permit, must not be initialized"
-            );
-            guard.value = Some(value);
-            guard.init_semaphore.close();
-            Ok(Guard(guard))
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
+
+                let guard = self.inner.lock().unwrap();
+
+                Ok(Self::set0(value, guard))
+            }
+            Err(_closed) => {
+                let guard = self.inner.lock().unwrap();
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(Guard(guard));
+            }
        }
    }

+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// to complete initializing the inner value.
+    ///
+    /// # Panics
+    ///
+    /// If the inner has already been initialized.
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        let guard = self.inner.lock().unwrap();
+
+        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
+        // give more permits right now.
+        if guard.init_semaphore.try_acquire().is_ok() {
+            drop(guard);
+            panic!("permit is of wrong origin");
+        }
+
+        Self::set0(value, guard)
+    }
+
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+        if guard.value.is_some() {
+            drop(guard);
+            unreachable!("we won permit, must not be initialized");
+        }
+        guard.value = Some(value);
+        guard.init_semaphore.close();
+        Guard(guard)
+    }
+
    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
@@ -103,6 +141,28 @@ impl<T> OnceCell<T> {
            None
        }
    }
+
+    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
+    pub fn initializer_count(&self) -> usize {
+        self.initializers.load(Ordering::Relaxed)
+    }
+}
+
+/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
+/// initializing task for example at the end of initialization.
+struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
+
+impl<'a, T> CountWaitingInitializers<'a, T> {
+    fn start(target: &'a OnceCell<T>) -> Self {
+        target.initializers.fetch_add(1, Ordering::Relaxed);
+        CountWaitingInitializers(target)
+    }
+}
+
+impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
+    fn drop(&mut self) {
+        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
+    }
 }

 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
@@ -135,7 +195,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore
@@ -145,11 +205,14 @@ impl<'a, T> Guard<'a, T> {
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
-            .map(|v| (v, permit))
+            .map(|v| (v, InitPermit(permit)))
            .expect("guard is not created unless value has been initialized")
    }
 }

+/// Type held by OnceCell (de)initializing task.
+pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -185,11 +248,11 @@ mod tests {
                    barrier.wait().await;
                    let won = {
                        let g = cell
-                            .get_or_init(|| {
+                            .get_or_init(|permit| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
-                                    Ok::<_, Infallible>(i)
+                                    Ok::<_, Infallible>((i, permit))
                                }
                            })
                            .await
@@ -243,7 +306,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
            .await
            .unwrap();

@@ -258,18 +321,32 @@ mod tests {
        assert_eq!(*cell.get().unwrap(), reinit);
    }

+    #[test]
+    fn reinit_with_deinit_permit() {
+        let cell = Arc::new(OnceCell::new(42));
+
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
+
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        assert_eq!(5, five);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
+    }
+
    #[tokio::test]
    async fn initialization_attemptable_until_ok() {
        let cell = OnceCell::default();

        for _ in 0..10 {
-            cell.get_or_init(|| async { Err("whatever error") })
+            cell.get_or_init(|_permit| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
-            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
@@ -281,11 +358,11 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

-        let initializer = cell.get_or_init(|| async {
+        let initializer = cell.get_or_init(|permit| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

-            Ok::<_, Infallible>("never reached")
+            Ok::<_, Infallible>(("never reached", permit))
        });

        tokio::select! {
@@ -298,7 +375,7 @@ mod tests {
        assert!(cell.get().is_none());

        let g = cell
-            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -21,11 +21,6 @@ pub struct FileCacheState {

 #[derive(Debug)]
 pub struct FileCacheConfig {
-    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
-    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
-    /// memory available for the cgroup.
-    pub(crate) in_memory: bool,
-
    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl FileCacheConfig {
-    pub fn default_in_memory() -> Self {
+impl Default for FileCacheConfig {
+    fn default() -> Self {
        Self {
-            in_memory: true,
-            // 75 %
-            resource_multiplier: 0.75,
-            // 640 MiB; (512 + 128)
-            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
-            // ensure any increase in file cache size is split 90-10 with 10% to other memory
-            spread_factor: 0.1,
-        }
-    }
-
-    pub fn default_on_disk() -> Self {
-        Self {
-            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -83,7 +65,9 @@ impl FileCacheConfig {
            spread_factor: 0.1,
        }
    }
+}

+impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -39,16 +39,6 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

-    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
-    /// kernel's page cache), and therefore should not count against available memory.
-    //
-    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
-    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
-    // during the switch away from an in-memory file cache, we had to default to the previous
-    // behavior.
-    #[arg(long)]
-    pub file_cache_on_disk: bool,
-
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -156,10 +156,7 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = match args.file_cache_on_disk {
-                true => FileCacheConfig::default_on_disk(),
-                false => FileCacheConfig::default_in_memory(),
-            };
+            let config = FileCacheConfig::default();

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -187,10 +184,7 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

-            if args.file_cache_on_disk {
-                file_cache_disk_size = actual_size;
-            }
-
+            file_cache_disk_size = actual_size;
            state.filecache = Some(file_cache);
        }

@@ -239,17 +233,11 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let (expected_file_cache_size, expected_file_cache_disk_size) = self
+        let expected_file_cache_size = self
            .filecache
            .as_ref()
-            .map(|file_cache| {
-                let size = file_cache.config.calculate_cache_size(usable_system_memory);
-                match file_cache.config.in_memory {
-                    true => (size, 0),
-                    false => (size, size),
-                }
-            })
-            .unwrap_or((0, 0));
+            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
+            .unwrap_or(0);
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -273,7 +261,7 @@ impl Runner {

            let new_threshold = self
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_size);

            let current = last_history.avg_non_reclaimable;

@@ -300,13 +288,10 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
-            }
+            file_cache_disk_size = actual_usage;
            let message = format!(
-                "set file cache size to {} MiB (in memory = {})",
+                "set file cache size to {} MiB",
                bytes_to_mebibytes(actual_usage),
-                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);
@@ -357,9 +342,7 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if !file_cache.config.in_memory {
-                file_cache_disk_size = actual_usage;
-            }
+            file_cache_disk_size = actual_usage;

            if actual_usage != expected_usage {
                warn!(
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -206,6 +206,7 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
+                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -57,7 +57,10 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
-            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            headers.insert(
+                "Authorization",
+                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
+            );
            client = client.default_headers(headers);
        }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -271,7 +272,9 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .map_err(Into::into)
+            .maybe_fatal_err("save deletion header")?;
+
+        Ok(())
    }
 }

@@ -360,6 +363,7 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
+            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
--- a/pageserver/src/deletion_queue/check.log
+++ b/pageserver/src/deletion_queue/check.log
@@ -1,2 +0,0 @@
-    Checking pageserver v0.1.0 (/home/neon/neon/pageserver)
-    Finished dev [optimized + debuginfo] target(s) in 7.62s
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::virtual_file::on_fatal_io_error;
+use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -195,7 +197,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    Err(anyhow::anyhow!(e))
+                    on_fatal_io_error(&e, "reading deletion header");
                }
            }
        }
@@ -216,16 +218,9 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
-            Ok(d) => d,
-            Err(e) => {
-                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
-
-                // Give up: if we can't read the deletion list directory, we probably can't
-                // write lists into it later, so the queue won't work.
-                return Err(e.into());
-            }
-        };
+        let mut dir = tokio::fs::read_dir(&deletion_directory)
+            .await
+            .fatal_err("read deletion directory");

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -233,7 +228,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await? {
+        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -246,11 +241,9 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
-                    // Non-fatal error: we will just leave the file behind but not
-                    // try and load it.
-                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
-                }
+                tokio::fs::remove_file(&absolute_path)
+                    .await
+                    .fatal_err("delete temp file");

                continue;
            }
@@ -290,7 +283,9 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path).await?;
+            let list_bytes = tokio::fs::read(&list_path)
+                .await
+                .fatal_err("read deletion list");

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
+use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -287,16 +288,9 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-
-            if let Err(e) = tokio::fs::remove_file(&list_path).await {
-                // Unexpected: we should have permissions and nothing else should
-                // be touching these files.  We will leave the file behind.  Subsequent
-                // pageservers will try and load it again: hopefully whatever storage
-                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {list_path}: {e:#}");
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                break;
-            }
+            tokio::fs::remove_file(&list_path)
+                .await
+                .fatal_err("remove deletion list");
        }
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -569,7 +569,17 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
-          description: Tenant download is already in progress
+          description: |
+            The tenant is already known to Pageserver in some way,
+            and hence this `/attach` call has been rejected.
+
+            Some examples of how this can happen:
+            - tenant was created on this pageserver
+            - tenant attachment was started by an earlier call to `/attach`.
+
+            Callers should poll the tenant status's `attachment_status` field,
+            like for status 202. See the longer description for `POST /attach`
+            for details.
          content:
            application/json:
              schema:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -36,8 +36,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -148,60 +147,28 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::SlotError(e) => e.into(),
-            TenantMapInsertError::SlotUpsertError(e) => e.into(),
+            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{tmie}").into())
+            }
+            TenantMapInsertError::TenantAlreadyExists(id, state) => {
+                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+            }
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

-impl From<TenantSlotError> for ApiError {
-    fn from(e: TenantSlotError) -> ApiError {
-        use TenantSlotError::*;
-        match e {
-            NotFound(tenant_id) => {
-                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
-            }
-            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
-            e @ Conflict(_) => ApiError::Conflict(format!("{e}")),
-            InProgress => {
-                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
-            }
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantSlotUpsertError> for ApiError {
-    fn from(e: TenantSlotUpsertError) -> ApiError {
-        use TenantSlotUpsertError::*;
-        match e {
-            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantMapError> for ApiError {
-    fn from(e: TenantMapError) -> ApiError {
-        use TenantMapError::*;
-        match e {
-            StillInitializing | ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{e}").into())
-            }
-        }
-    }
-}
-
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            TenantStateError::SlotError(e) => e.into(),
-            TenantStateError::SlotUpsertError(e) => e.into(),
-            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
+            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
 }
@@ -222,7 +189,6 @@ impl From<GetTenantError> for ApiError {
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
-            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
        }
    }
 }
@@ -275,10 +241,8 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
        use crate::tenant::delete::DeleteTenantError::*;
        match value {
            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
-            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
-            SlotError(e) => e.into(),
-            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -405,7 +369,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -452,7 +416,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -491,7 +455,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -749,7 +713,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false)?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -781,7 +745,7 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id, false)
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
        .instrument(info_span!("tenant_delete_handler", %tenant_id))
        .await?;

@@ -812,7 +776,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;

    // this can be long operation
    let inputs = tenant
@@ -1071,7 +1035,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false)?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;

    let response = HashMap::from([
        (
@@ -1130,7 +1094,7 @@ async fn put_tenant_location_config_handler(
            .await
        {
            match e {
-                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
+                TenantStateError::NotFound(_) => {
                    // This API is idempotent: a NotFound on a detach is fine.
                }
                _ => return Err(e.into()),
@@ -1168,6 +1132,7 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1472,7 +1437,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -54,18 +55,16 @@ use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
-use crate::tenant::mgr::get_active_tenant_with_timeout;
-use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::Timeline;
+use crate::tenant::mgr::GetTenantError;
+use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_secs(10);
-
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -390,7 +389,7 @@ impl PageServerHandler {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

        // Make request tracer if needed
-        let tenant = mgr::get_active_tenant_with_timeout(tenant_id, ACTIVE_TENANT_TIMEOUT).await?;
+        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path = tenant
@@ -526,7 +525,7 @@ impl PageServerHandler {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(tenant_id, ACTIVE_TENANT_TIMEOUT).await?;
+        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;
@@ -585,7 +584,7 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

-        let timeline = get_active_tenant_timeline(tenant_id, timeline_id).await?;
+        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(QueryError::Other(
@@ -793,7 +792,7 @@ impl PageServerHandler {
        let started = std::time::Instant::now();

        // check that the timeline exists
-        let timeline = get_active_tenant_timeline(tenant_id, timeline_id).await?;
+        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -1049,7 +1048,7 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            let timeline = get_active_tenant_timeline(tenant_id, timeline_id).await?;
+            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;

            let end_of_timeline = timeline.get_last_record_rlsn();

@@ -1233,7 +1232,7 @@ where

            self.check_permission(Some(tenant_id))?;

-            let tenant = get_active_tenant_with_timeout(tenant_id, ACTIVE_TENANT_TIMEOUT).await?;
+            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1279,6 +1278,21 @@ where
    }
 }

+#[derive(thiserror::Error, Debug)]
+enum GetActiveTenantError {
+    #[error(
+        "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
+    )]
+    WaitForActiveTimeout {
+        latest_state: TenantState,
+        wait_time: Duration,
+    },
+    #[error(transparent)]
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
+}
+
 impl From<GetActiveTenantError> for QueryError {
    fn from(e: GetActiveTenantError) -> Self {
        match e {
@@ -1291,6 +1305,44 @@ impl From<GetActiveTenantError> for QueryError {
    }
 }

+/// Get active tenant.
+///
+/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
+/// ensures that queries don't fail immediately after pageserver startup, because
+/// all tenants are still loading.
+async fn get_active_tenant_with_timeout(
+    tenant_id: TenantId,
+    _ctx: &RequestContext, /* require get a context to support cancellation in the future */
+) -> Result<Arc<Tenant>, GetActiveTenantError> {
+    let tenant = match mgr::get_tenant(tenant_id, false).await {
+        Ok(tenant) => tenant,
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+        Err(GetTenantError::Broken(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+    };
+    let wait_time = Duration::from_secs(30);
+    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
+        Ok(Ok(())) => Ok(tenant),
+        // no .context(), the error message is good enough and some tests depend on it
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
+        Err(_) => {
+            let latest_state = tenant.current_state();
+            if latest_state == TenantState::Active {
+                Ok(tenant)
+            } else {
+                Err(GetActiveTenantError::WaitForActiveTimeout {
+                    latest_state,
+                    wait_time,
+                })
+            }
+        }
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 enum GetActiveTimelineError {
    #[error(transparent)]
@@ -1312,8 +1364,9 @@ impl From<GetActiveTimelineError> for QueryError {
 async fn get_active_tenant_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
+    ctx: &RequestContext,
 ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ACTIVE_TENANT_TIMEOUT)
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
    let timeline = tenant
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -291,9 +291,6 @@ pub enum TaskKind {
    // A request that comes in via the pageserver HTTP API.
    MgmtRequest,

-    // Tenant Manager servicing a channel that enables upcalls from Tenant
-    TenantManagerUpcall,
-
    DebugTool,

    #[cfg(test)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -51,9 +51,10 @@ use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
-use self::delete::should_resume_deletion;
+use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
+use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
@@ -249,12 +250,8 @@ pub struct Tenant {
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
-}

-impl std::fmt::Debug for Tenant {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{} ({})", self.tenant_id, self.current_state())
-    }
+    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }

 pub(crate) enum WalRedoManager {
@@ -349,14 +346,14 @@ impl Debug for DeleteTimelineError {
 }

 pub enum SetStoppingError {
-    AlreadyStopping,
+    AlreadyStopping(completion::Barrier),
    Broken,
 }

 impl Debug for SetStoppingError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
-            Self::AlreadyStopping => f.debug_tuple("AlreadyStopping").finish(),
+            Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
            Self::Broken => write!(f, "Broken"),
        }
    }
@@ -412,11 +409,6 @@ enum CreateTimelineCause {
    Delete,
 }

-#[derive(Debug)]
-enum ShutdownError {
-    AlreadyStopping,
-}
-
 impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -534,8 +526,8 @@ impl Tenant {
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
        init_order: Option<InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
-        resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
@@ -622,48 +614,67 @@ impl Tenant {
                // Remote preload is complete.
                drop(remote_load_completion);

-                let resume_deletion = should_resume_deletion(conf, preload.as_ref().map(|p| p.deleting).unwrap_or(false), &tenant_id).await?;
-                if resume_deletion {
-                    // We will wait until the background 
-                    info!("Tenant is partially deleted, will resume");
-
-                    // Put the Tenant into a Stopping state so that it will not try to serve any I/O
-                    tenant_clone
-                        .set_stopping(false, true)
-                        .await
-                        .expect("cant be stopping or broken");
-
-                    // Proceed with attaching the tenant to load the metadata we will use for remote deletion
-                    match tenant_clone.attach(init_order, preload, &ctx).await {
-                        Ok(()) => {
-                            info!("attach finished, deletion will resume");
-                        }
-                        Err(e) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(e));
-                        }
-                    }
-
-                    if let Some(resume_deletion_upcall) = resume_deletion_upcall {
-                        // If this send() fails it means we're shutting down, so just drop it on the floor
-                        resume_deletion_upcall.send(tenant_clone).await.ok();
-                    } else {
-                        make_broken(&tenant_clone, anyhow::anyhow!(
-                            "Attemped to attach a partially deleted tenant outside of pageserver startup"
-                        ));
-                    }
-                } else {
-                    // Normal case: load all the metadata for the tenant.
-                    match tenant_clone.attach(init_order, preload, &ctx).await {
-                        Ok(()) => {
-                            info!("attach finished, activating");
-                            tenant_clone.activate(broker_client, None, &ctx);
-                        }
-                        Err(e) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(e));
+                let pending_deletion = {
+                    match DeleteTenantFlow::should_resume_deletion(
+                        conf,
+                        preload.as_ref().map(|p| p.deleting).unwrap_or(false),
+                        &tenant_clone,
+                    )
+                    .await
+                    {
+                        Ok(should_resume_deletion) => should_resume_deletion,
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
                        }
                    }
                };

+                info!("pending_deletion {}", pending_deletion.is_some());
+
+                if let Some(deletion) = pending_deletion {
+                    // as we are no longer loading, signal completion by dropping
+                    // the completion while we resume deletion
+                    drop(_completion);
+                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
+                    let _ = init_order
+                        .as_mut()
+                        .and_then(|x| x.initial_logical_size_attempt.take());
+                    let background_jobs_can_start =
+                        init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                    if let Some(background) = background_jobs_can_start {
+                        info!("waiting for backgound jobs barrier");
+                        background.clone().wait().await;
+                        info!("ready for backgound jobs barrier");
+                    }
+
+                    match DeleteTenantFlow::resume_from_attach(
+                        deletion,
+                        &tenant_clone,
+                        preload,
+                        tenants,
+                        init_order,
+                        &ctx,
+                    )
+                    .await
+                    {
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                        Ok(()) => return Ok(()),
+                    }
+                }
+
+                match tenant_clone.attach(init_order, preload, &ctx).await {
+                    Ok(()) => {
+                        info!("attach finished, activating");
+                        tenant_clone.activate(broker_client, None, &ctx);
+                    }
+                    Err(e) => {
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
+                    }
+                }
                Ok(())
            }
            .instrument({
@@ -1786,7 +1797,16 @@ impl Tenant {
    /// - detach + ignore (freeze_and_flush == false)
    ///
    /// This will attempt to shutdown even if tenant is broken.
-    async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
+    ///
+    /// `shutdown_progress` is a [`completion::Barrier`] for the shutdown initiated by this call.
+    /// If the tenant is already shutting down, we return a clone of the first shutdown call's
+    /// `Barrier` as an `Err`. This not-first caller can use the returned barrier to join with
+    /// the ongoing shutdown.
+    async fn shutdown(
+        &self,
+        shutdown_progress: completion::Barrier,
+        freeze_and_flush: bool,
+    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();
        // Set tenant (and its timlines) to Stoppping state.
        //
@@ -1806,17 +1826,14 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        match self.set_stopping(false, false).await {
+        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
            }
-            Err(SetStoppingError::AlreadyStopping) => {
-                // This should not happen: individual tenant shutdowns are guarded by
-                // `[TenantSlot::InProgress]`, and when we shutdown all tenants during
-                // process shutdown, we just wait for those InProgress ones to finish.
-                error!("Called Tenant::shutdown while already stopping");
-                return Err(ShutdownError::AlreadyStopping);
+            Err(SetStoppingError::AlreadyStopping(other)) => {
+                // give caller the option to wait for this this shutdown
+                return Err(other);
            }
        };

@@ -1857,6 +1874,7 @@ impl Tenant {
    /// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
    async fn set_stopping(
        &self,
+        progress: completion::Barrier,
        allow_transition_from_loading: bool,
        allow_transition_from_attaching: bool,
    ) -> Result<(), SetStoppingError> {
@@ -1873,7 +1891,7 @@ impl Tenant {
                false
            }
            TenantState::Loading => allow_transition_from_loading,
-            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => true,
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
        .expect("cannot drop self.state while on a &self method");
@@ -1888,21 +1906,21 @@ impl Tenant {
                if !allow_transition_from_attaching {
                    unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
                };
-                *current_state = TenantState::Stopping;
+                *current_state = TenantState::Stopping { progress };
                true
            }
            TenantState::Loading => {
                if !allow_transition_from_loading {
                    unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
                };
-                *current_state = TenantState::Stopping;
+                *current_state = TenantState::Stopping { progress };
                true
            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
                // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
-                *current_state = TenantState::Stopping;
+                *current_state = TenantState::Stopping { progress };
                // Continue stopping outside the closure. We need to grab timelines.lock()
                // and we plan to turn it into a tokio::sync::Mutex in a future patch.
                true
@@ -1914,9 +1932,9 @@ impl Tenant {
                err = Some(SetStoppingError::Broken);
                false
            }
-            TenantState::Stopping  => {
+            TenantState::Stopping { progress } => {
                info!("Tenant is already in Stopping state");
-                err = Some(SetStoppingError::AlreadyStopping);
+                err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
                false
            }
        });
@@ -2092,9 +2110,6 @@ where
 }

 impl Tenant {
-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.read().unwrap().tenant_conf
    }
@@ -2340,6 +2355,7 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
        }
    }

@@ -4102,7 +4118,7 @@ mod tests {
            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(true)
+                .shutdown(Default::default(), true)
                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
                .await
                .ok()
@@ -4143,7 +4159,7 @@ mod tests {

            // so that all uploads finish & we can call harness.load() below again
            tenant
-                .shutdown(true)
+                .shutdown(Default::default(), true)
                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
                .await
                .ok()
@@ -4205,7 +4221,7 @@ mod tests {
        drop(tline);
        // so that all uploads finish & we can call harness.try_load() below again
        tenant
-            .shutdown(true)
+            .shutdown(Default::default(), true)
            .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
            .await
            .ok()
@@ -4220,7 +4236,11 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness.try_load_local(&ctx).await.expect_err("should fail");
+        let err = harness
+            .try_load_local(&ctx)
+            .await
+            .err()
+            .expect("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,27 +1,31 @@
 use std::sync::Arc;

 use anyhow::Context;
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
 use remote_storage::{GenericRemoteStorage, RemotePath};
+use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::error;
+use tracing::{error, instrument, warn, Instrument, Span};

 use utils::{
-    backoff, crashsafe,
+    backoff, completion, crashsafe, fs_ext,
    id::{TenantId, TimelineId},
 };

 use crate::{
    config::PageServerConf,
-    tenant::{mgr::safe_rename_tenant_dir, ShutdownError},
+    context::RequestContext,
+    task_mgr::{self, TaskKind},
+    InitializationOrder,
 };

 use super::{
-    mgr::{GetTenantError, SlotGuard, TenantSlotError, TenantSlotUpsertError},
+    mgr::{GetTenantError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    span,
    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant,
+    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
 };

 #[derive(Debug, thiserror::Error)]
@@ -29,17 +33,11 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

-    #[error("Tenant not attached")]
-    NotAttached,
-
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

-    #[error("Tenant map slot error {0}")]
-    SlotError(#[from] TenantSlotError),
-
-    #[error("Tenant map slot upsert error {0}")]
-    SlotUpsertError(#[from] TenantSlotUpsertError),
+    #[error("Tenant deletion is already in progress")]
+    AlreadyInProgress,

    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),
@@ -48,142 +46,7 @@ pub(crate) enum DeleteTenantError {
    Other(#[from] anyhow::Error),
 }

-/// The part of tenant deletion that must happen before acknowledging the deletion request
-///
-/// Usually deletion requires that the tenant is not already in Stopping state: set
-/// `resume` to true to skip this check if resuming deletion at startup, since we
-/// would have loaded the tenant into Stopping mode already
-pub(crate) async fn delete_tenant_foreground(
-    conf: &'static PageServerConf,
-    remote_storage: Option<GenericRemoteStorage>,
-    tenant: &Arc<Tenant>,
-    resume: bool,
-    // Witness that we are doing this work within a TenantSlot::InProgress state.
-    _slot_guard: &SlotGuard,
-) -> Result<(), DeleteTenantError> {
-    // In resume mode, we must be stopping.  Else, we must _not_ be Stopping.
-    if !(matches!(tenant.current_state(), TenantState::Stopping) ^ !resume) {
-        return Err(DeleteTenantError::InvalidState(tenant.current_state()));
-    }
-
-    fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-create-remote-mark"
-        ))?
-    });
-
-    // Write persistent tombstone
-    if let Some(remote_storage) = &remote_storage {
-        create_remote_delete_mark(conf, remote_storage, &tenant.get_tenant_id()).await?;
-    }
-
-    fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-create-local-mark"
-        ))?
-    });
-
-    create_local_delete_mark(conf, &tenant.tenant_id)
-        .await
-        .context("local create mark")?;
-
-    fail::fail_point!("tenant-delete-before-background", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-background"
-        ))?
-    });
-
-    Ok(())
-}
-
-/// The part of tenant deletion that happens after we have already acknowledged the request.
-///
-/// This part may retried after pageserver restart if we see a tenant that has a deletion marker
-/// but has not been completely deleted.
-pub(crate) async fn delete_tenant_background(
-    conf: &'static PageServerConf,
-    remote_storage: Option<GenericRemoteStorage>,
-    tenant_id: TenantId,
-    tenant: &Arc<Tenant>,
-    resume: bool,
-    // Witness that we are doing this work within a TenantSlot::InProgress state.
-    _slot_guard: &SlotGuard,
-) -> anyhow::Result<()> {
-    let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
-        .await
-        .context("schedule_ordered_timeline_deletions")?;
-
-    fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-polling-ongoing-deletions"
-        ))?
-    });
-
-    // Wait for deletions that were already running at the moment when tenant deletion was requested.
-    // When we can lock deletion guard it means that corresponding timeline deletion finished.
-    for (guard, timeline_id) in already_running_timeline_deletions {
-        let flow = guard.lock().await;
-        if !flow.is_finished() {
-            return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                "already running timeline deletion failed: {timeline_id}"
-            ))
-            .into());
-        }
-    }
-
-    // For convenience, use the Tenant's deletion queue reference so that we don't have
-    // to take it as an  argument.
-    let deletion_queue_client = tenant.deletion_queue_client.clone();
-
-    fail::fail_point!("tenant-delete-before-shutdown", |_| {
-        Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
-    });
-
-    // Tear down local runtime state
-    if !resume {
-        tenant.shutdown(false).await.map_err(|e| match e {
-            ShutdownError::AlreadyStopping => {
-                DeleteTenantError::InvalidState(TenantState::Stopping)
-            }
-        })?;
-    }
-
-    // Not necessary for correctness, but executing deletions before erasing local contents
-    // means that we will have a better chance to resume the delete if we crash.
-    deletion_queue_client.flush_execute().await?;
-
-    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-deleted-mark"
-        ))?
-    });
-
-    // Remove the deletion marker
-    remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant_id).await?;
-
-    // Not necessary for correctness, but helps make it true that when we log that deletion is
-    // done, it really is.
-    deletion_queue_client.flush_execute().await?;
-
-    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-tenant-dir"
-        ))?
-    });
-
-    // Remove local storage contents.  We do this last, so that if we crash during delete, on
-    // restart we will attempt to re-attach the tenant and resume the deletion.
-    let local_tenant_directory = conf.tenant_path(&tenant_id);
-    let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
-        .await
-        .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-
-    tokio::fs::remove_dir_all(tmp_path.as_path())
-        .await
-        .with_context(|| format!("tenant directory {:?} deletion", tmp_path))?;
-
-    Ok(())
-}
+type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;

 fn remote_tenant_delete_mark_path(
    conf: &PageServerConf,
@@ -198,7 +61,7 @@ fn remote_tenant_delete_mark_path(
    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }

-pub(crate) async fn create_remote_delete_mark(
+async fn create_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
@@ -243,26 +106,7 @@ async fn create_local_delete_mark(
    Ok(())
 }

-pub(crate) async fn should_resume_deletion(
-    conf: &'static PageServerConf,
-    remote_mark_exists: bool,
-    tenant_id: &TenantId,
-) -> Result<bool, DeleteTenantError> {
-    if remote_mark_exists {
-        return Ok(true);
-    }
-
-    // In the very last stage of deletion, we might have already removed the remote
-    // marker, but be attaching the tenant anyway on the basis of its local directory
-    // existing: to resume deletion in this case we need the local deletion marker.
-    if conf.tenant_deleted_mark_file_path(tenant_id).exists() {
-        Ok(true)
-    } else {
-        Ok(false)
-    }
-}
-
-pub(crate) async fn schedule_ordered_timeline_deletions(
+async fn schedule_ordered_timeline_deletions(
    tenant: &Arc<Tenant>,
 ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
    // Tenant is stopping at this point. We know it will be deleted.
@@ -300,7 +144,21 @@ pub(crate) async fn schedule_ordered_timeline_deletions(
    Ok(already_running_deletions)
 }

-pub(crate) async fn remove_tenant_remote_delete_mark(
+async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
+    // Assert timelines dir is empty.
+    if !fs_ext::is_directory_empty(timelines_path).await? {
+        // Display first 10 items in directory
+        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
+        let list = &list.into_iter().take(10).collect::<Vec<_>>();
+        return Err(DeleteTenantError::Other(anyhow::anyhow!(
+            "Timelines directory is not empty after all timelines deletion: {list:?}"
+        )));
+    }
+
+    Ok(())
+}
+
+async fn remove_tenant_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
    tenant_id: &TenantId,
@@ -321,3 +179,369 @@ pub(crate) async fn remove_tenant_remote_delete_mark(
    }
    Ok(())
 }
+
+// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
+async fn cleanup_remaining_fs_traces(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
+        if is_dir {
+            tokio::fs::remove_dir(&p).await
+        } else {
+            tokio::fs::remove_file(&p).await
+        }
+        .or_else(fs_ext::ignore_not_found)
+        .with_context(|| format!("failed to delete {p}"))
+    };
+
+    rm(conf.tenant_config_path(tenant_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-timelines-dir"
+        ))?
+    });
+
+    rm(conf.timelines_path(tenant_id), true).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-deleted-mark"
+        ))?
+    });
+
+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So its possible for mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we dont need to sync after mark file is removed
+    // because we can tolerate the case when mark file reappears on startup.
+    let tenant_path = &conf.tenant_path(tenant_id);
+    if tenant_path.exists() {
+        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+            .await
+            .context("fsync_pre_mark_remove")?;
+    }
+
+    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-tenant-dir"
+        ))?
+    });
+
+    rm(conf.tenant_path(tenant_id), true).await?;
+
+    Ok(())
+}
+
+/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
+/// and deletes its data from both disk and s3.
+/// The sequence of steps:
+/// 1. Upload remote deletion mark.
+/// 2. Create local mark file.
+/// 3. Shutdown tasks
+/// 4. Run ordered timeline deletions
+/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
+/// 6. Remove remote mark
+/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+/// It is resumable from any step in case a crash/restart occurs.
+/// There are two entrypoints to the process:
+/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
+///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
+#[derive(Default)]
+pub enum DeleteTenantFlow {
+    #[default]
+    NotStarted,
+    InProgress,
+    Finished,
+}
+
+impl DeleteTenantFlow {
+    // These steps are run in the context of management api request handler.
+    // Long running steps are continued to run in the background.
+    // NB: If this fails half-way through, and is retried, the retry will go through
+    // all the same steps again. Make sure the code here is idempotent, and don't
+    // error out if some of the shutdown tasks have already been completed!
+    // NOTE: static needed for background part.
+    // We assume that calling code sets up the span with tenant_id.
+    #[instrument(skip_all)]
+    pub(crate) async fn run(
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(), DeleteTenantError> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
+
+        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+            tenant.set_broken(format!("{e:#}")).await;
+            return Err(e);
+        }
+
+        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+
+        Ok(())
+    }
+
+    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
+    // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
+    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
+    // So the solution is to set tenant state to broken.
+    async fn run_inner(
+        guard: &mut OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<(), DeleteTenantError> {
+        guard.mark_in_progress()?;
+
+        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-remote-mark"
+            ))?
+        });
+
+        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
+        // Though sounds scary, different mark name?
+        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
+        if let Some(remote_storage) = &remote_storage {
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+                .await
+                .context("remote_mark")?
+        }
+
+        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-local-mark"
+            ))?
+        });
+
+        create_local_delete_mark(conf, &tenant.tenant_id)
+            .await
+            .context("local delete mark")?;
+
+        fail::fail_point!("tenant-delete-before-background", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-background"
+            ))?
+        });
+
+        Ok(())
+    }
+
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        match self {
+            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
+            Self::InProgress { .. } => { /* We're in a retry */ }
+            Self::NotStarted => { /* Fresh start */ }
+        }
+
+        *self = Self::InProgress;
+
+        Ok(())
+    }
+
+    pub(crate) async fn should_resume_deletion(
+        conf: &'static PageServerConf,
+        remote_mark_exists: bool,
+        tenant: &Tenant,
+    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
+        let acquire = |t: &Tenant| {
+            Some(
+                Arc::clone(&t.delete_progress)
+                    .try_lock_owned()
+                    .expect("we're the only owner during init"),
+            )
+        };
+
+        if remote_mark_exists {
+            return Ok(acquire(tenant));
+        }
+
+        let tenant_id = tenant.tenant_id;
+        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
+        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+            Ok(acquire(tenant))
+        } else {
+            Ok(None)
+        }
+    }
+
+    pub(crate) async fn resume_from_attach(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        preload: Option<TenantPreload>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        init_order: Option<InitializationOrder>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        let (_, progress) = completion::channel();
+
+        tenant
+            .set_stopping(progress, false, true)
+            .await
+            .expect("cant be stopping or broken");
+
+        tenant
+            .attach(init_order, preload, ctx)
+            .await
+            .context("attach")?;
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
+    async fn prepare(
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
+        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
+        // so at least for now allow deletions only for active tenants. TODO recheck
+        // Broken and Stopping is needed for retries.
+        if !matches!(
+            tenant.current_state(),
+            TenantState::Active | TenantState::Broken { .. }
+        ) {
+            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        }
+
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+
+        fail::fail_point!("tenant-delete-before-shutdown", |_| {
+            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+        });
+
+        // make pageserver shutdown not to wait for our completion
+        let (_, progress) = completion::channel();
+
+        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
+        // i e it is an error to do:
+        // tenant.set_stopping
+        // tenant.shutdown
+        // Its also bad that we're holding tenants.read here.
+        // TODO relax set_stopping to be idempotent?
+        if tenant.shutdown(progress, false).await.is_err() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "tenant shutdown is already in progress"
+            )));
+        }
+
+        Ok((Arc::clone(tenant), guard))
+    }
+
+    fn schedule_background(
+        guard: OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+    ) {
+        let tenant_id = tenant.tenant_id;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_id),
+            None,
+            "tenant_delete",
+            false,
+            async move {
+                if let Err(err) =
+                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
+                {
+                    error!("Error: {err:#}");
+                    tenant.set_broken(format!("{err:#}")).await;
+                };
+                Ok(())
+            }
+            .instrument({
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
+    async fn background(
+        mut guard: OwnedMutexGuard<Self>,
+        conf: &PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: &Arc<Tenant>,
+    ) -> Result<(), DeleteTenantError> {
+        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
+        // Note that if deletion fails we dont mark timelines as broken,
+        // the whole tenant will become broken as by `Self::schedule_background` logic
+        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
+            .await
+            .context("schedule_ordered_timeline_deletions")?;
+
+        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-polling-ongoing-deletions"
+            ))?
+        });
+
+        // Wait for deletions that were already running at the moment when tenant deletion was requested.
+        // When we can lock deletion guard it means that corresponding timeline deletion finished.
+        for (guard, timeline_id) in already_running_timeline_deletions {
+            let flow = guard.lock().await;
+            if !flow.is_finished() {
+                return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                    "already running timeline deletion failed: {timeline_id}"
+                )));
+            }
+        }
+
+        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
+        if timelines_path.exists() {
+            // sanity check to guard against layout changes
+            ensure_timelines_dir_empty(&timelines_path)
+                .await
+                .context("timelines dir not empty")?;
+        }
+
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+
+        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
+            ))?
+        });
+
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+            .await
+            .context("cleanup_remaining_fs_traces")?;
+
+        let mut locked = tenants.write().await;
+        if locked.remove(&tenant.tenant_id).is_none() {
+            warn!("Tenant got removed from tenants map during deletion");
+        };
+
+        *guard = Self::Finished;
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -125,6 +125,7 @@ impl Layer {
            let inner = Arc::new(DownloadedLayer {
                owner: owner.clone(),
                kind: tokio::sync::OnceCell::default(),
+                version: 0,
            });
            resident = Some(inner.clone());

@@ -163,6 +164,7 @@ impl Layer {
            let inner = Arc::new(DownloadedLayer {
                owner: owner.clone(),
                kind: tokio::sync::OnceCell::default(),
+                version: 0,
            });
            resident = Some(inner.clone());
            let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
@@ -328,42 +330,46 @@ impl Layer {
 /// read with [`Layer::get_value_reconstruct_data`].
 ///
 /// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
+#[derive(Debug)]
 enum ResidentOrWantedEvicted {
    Resident(Arc<DownloadedLayer>),
-    WantedEvicted(Weak<DownloadedLayer>),
+    WantedEvicted(Weak<DownloadedLayer>, usize),
 }

 impl ResidentOrWantedEvicted {
-    fn get(&self) -> Option<Arc<DownloadedLayer>> {
+    fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
        match self {
-            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
-            ResidentOrWantedEvicted::WantedEvicted(weak) => match weak.upgrade() {
+            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
+            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
                Some(strong) => {
                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
-                    Some(strong)
+
+                    *self = ResidentOrWantedEvicted::Resident(strong.clone());
+
+                    Some((strong, true))
                }
                None => None,
            },
        }
    }
+
    /// When eviction is first requested, drop down to holding a [`Weak`].
    ///
-    /// Returns `true` if this was the first time eviction was requested.
-    fn downgrade(&mut self) -> &Weak<DownloadedLayer> {
-        let _was_first = match self {
+    /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
+    /// drop the possibly last strong reference outside of the mutex of
+    /// heavier_once_cell::OnceCell.
+    fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
+        match self {
            ResidentOrWantedEvicted::Resident(strong) => {
                let weak = Arc::downgrade(strong);
-                *self = ResidentOrWantedEvicted::WantedEvicted(weak);
-                // returning the weak is not useful, because the drop could had already ran with
-                // the replacement above, and that will take care of cleaning the Option we are in
-                true
+                let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
+                std::mem::swap(self, &mut temp);
+                match temp {
+                    ResidentOrWantedEvicted::Resident(strong) => Some(strong),
+                    ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
+                }
            }
-            ResidentOrWantedEvicted::WantedEvicted(_) => false,
-        };
-
-        match self {
-            ResidentOrWantedEvicted::WantedEvicted(ref weak) => weak,
-            _ => unreachable!("just wrote wanted evicted"),
+            ResidentOrWantedEvicted::WantedEvicted(..) => None,
        }
    }
 }
@@ -398,11 +404,17 @@ struct LayerInner {
    /// [`LayerInner::on_downloaded_layer_drop`].
    wanted_evicted: AtomicBool,

-    /// Version is to make sure we will in fact only evict a file if no new download has been
-    /// started.
+    /// Version is to make sure we will only evict a specific download of a file.
+    ///
+    /// Incremented for each download, stored in `DownloadedLayer::version` or
+    /// `ResidentOrWantedEvicted::WantedEvicted`.
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
+    ///
+    /// If in future we need to implement "wait until layer instances are gone and done", carrying
+    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
+    /// method for "wait_gc" which will wait to this being closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -515,6 +527,14 @@ impl LayerInner {
            .timeline_path(&timeline.tenant_id, &timeline.timeline_id)
            .join(desc.filename().to_string());

+        let (inner, version) = if let Some(inner) = downloaded {
+            let version = inner.version;
+            let resident = ResidentOrWantedEvicted::Resident(inner);
+            (heavier_once_cell::OnceCell::new(resident), version)
+        } else {
+            (heavier_once_cell::OnceCell::default(), 0)
+        };
+
        LayerInner {
            conf,
            path,
@@ -524,12 +544,8 @@ impl LayerInner {
            access_stats,
            wanted_garbage_collected: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
-            inner: if let Some(inner) = downloaded {
-                heavier_once_cell::OnceCell::new(ResidentOrWantedEvicted::Resident(inner))
-            } else {
-                heavier_once_cell::OnceCell::default()
-            },
-            version: AtomicUsize::new(0),
+            inner,
+            version: AtomicUsize::new(version),
            status: tokio::sync::broadcast::channel(1).0,
            consecutive_failures: AtomicUsize::new(0),
            generation,
@@ -549,6 +565,8 @@ impl LayerInner {
        }
    }

+    /// Cancellation safe, however dropping the future and calling this method again might result
+    /// in a new attempt to evict OR join the previously started attempt.
    pub(crate) async fn evict_and_wait(
        &self,
        _: &RemoteTimelineClient,
@@ -559,20 +577,22 @@ impl LayerInner {

        let mut rx = self.status.subscribe();

-        let res =
-            self.wanted_evicted
-                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
+        let strong = {
+            match self.inner.get() {
+                Some(mut either) => {
+                    self.wanted_evicted.store(true, Ordering::Relaxed);
+                    either.downgrade()
+                }
+                None => return Err(EvictionError::NotFound),
+            }
+        };

-        if res.is_ok() {
+        if strong.is_some() {
+            // drop the DownloadedLayer outside of the holding the guard
+            drop(strong);
            LAYER_IMPL_METRICS.inc_started_evictions();
        }

-        if self.get().is_none() {
-            // it was not evictable in the first place
-            // our store to the wanted_evicted does not matter; it will be reset by next download
-            return Err(EvictionError::NotFound);
-        }
-
        match rx.recv().await {
            Ok(Status::Evicted) => Ok(()),
            Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
@@ -586,7 +606,8 @@ impl LayerInner {
                //
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
-                match self.get() {
+                LAYER_IMPL_METRICS.inc_broadcast_lagged();
+                match self.inner.get() {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -594,17 +615,19 @@ impl LayerInner {
        }
    }

-    /// Should be cancellation safe, but cancellation is troublesome together with the spawned
-    /// download.
+    /// Cancellation safe.
+    #[tracing::instrument(skip_all, fields(layer=%self))]
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
        ctx: Option<&RequestContext>,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
+        let mut init_permit = None;
+
        loop {
-            let download = move || async move {
+            let download = move |permit| async move {
                // disable any scheduled but not yet running eviction deletions for this
-                self.version.fetch_add(1, Ordering::Relaxed);
+                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

                // no need to make the evict_and_wait wait for the actual download to complete
                drop(self.status.send(Status::Downloaded));
@@ -623,7 +646,11 @@ impl LayerInner {
                    .await
                    .map_err(DownloadError::PreStatFailed)?;

-                if let Some(reason) = needs_download {
+                let permit = if let Some(reason) = needs_download {
+                    if let NeedsDownload::NotFile(ft) = reason {
+                        return Err(DownloadError::NotFile(ft));
+                    }
+
                    // only reset this after we've decided we really need to download. otherwise it'd
                    // be impossible to mark cancelled downloads for eviction, like one could imagine
                    // we would like to do for prefetching which was not needed.
@@ -633,8 +660,6 @@ impl LayerInner {
                        return Err(DownloadError::NoRemoteStorage);
                    }

-                    tracing::debug!(%reason, "downloading layer");
-
                    if let Some(ctx) = ctx {
                        self.check_expected_download(ctx)?;
                    }
@@ -645,16 +670,21 @@ impl LayerInner {
                        return Err(DownloadError::DownloadRequired);
                    }

-                    self.spawn_download_and_wait(timeline).await?;
+                    tracing::info!(%reason, "downloading on-demand");
+
+                    self.spawn_download_and_wait(timeline, permit).await?
                } else {
                    // the file is present locally, probably by a previous but cancelled call to
                    // get_or_maybe_download. alternatively we might be running without remote storage.
                    LAYER_IMPL_METRICS.inc_init_needed_no_download();
-                }
+
+                    permit
+                };

                let res = Arc::new(DownloadedLayer {
                    owner: Arc::downgrade(self),
                    kind: tokio::sync::OnceCell::default(),
+                    version: next_version,
                });

                self.access_stats.record_residence_event(
@@ -662,19 +692,60 @@ impl LayerInner {
                    LayerResidenceEventReason::ResidenceChange,
                );

-                Ok(ResidentOrWantedEvicted::Resident(res))
+                let waiters = self.inner.initializer_count();
+                if waiters > 0 {
+                    tracing::info!(waiters, "completing the on-demand download for other tasks");
+                }
+
+                Ok((ResidentOrWantedEvicted::Resident(res), permit))
            };

-            let locked = self.inner.get_or_init(download).await?;
-
-            if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
-            {
+            if let Some(init_permit) = init_permit.take() {
+                // use the already held initialization permit because it is impossible to hit the
+                // below paths anymore essentially limiting the max loop iterations to 2.
+                let (value, init_permit) = download(init_permit).await?;
+                let mut guard = self.inner.set(value, init_permit);
+                let (strong, _upgraded) = guard
+                    .get_and_upgrade()
+                    .expect("init creates strong reference, we held the init permit");
                return Ok(strong);
            }

-            // the situation in which we might need to retry is that our init was ready
-            // immediatedly, but the DownloadedLayer had been dropped BUT failed to complete
-            // Self::evict_blocking
+            let (weak, permit) = {
+                let mut locked = self.inner.get_or_init(download).await?;
+
+                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
+                    if upgraded {
+                        // when upgraded back, the Arc<DownloadedLayer> is still available, but
+                        // previously a `evict_and_wait` was received.
+                        self.wanted_evicted.store(false, Ordering::Relaxed);
+
+                        // error out any `evict_and_wait`
+                        drop(self.status.send(Status::Downloaded));
+                        LAYER_IMPL_METRICS
+                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
+                    }
+
+                    return Ok(strong);
+                } else {
+                    // path to here: the evict_blocking is stuck on spawn_blocking queue.
+                    //
+                    // reset the contents, deactivating the eviction and causing a
+                    // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
+                    locked.take_and_deinit()
+                }
+            };
+
+            // unlock first, then drop the weak, but because upgrade failed, we
+            // know it cannot be a problem.
+
+            assert!(
+                matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
+                "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
+            );
+
+            init_permit = Some(permit);
+
            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
        }
    }
@@ -686,8 +757,8 @@ impl LayerInner {
        match b {
            Download => Ok(()),
            Warn | Error => {
-                tracing::warn!(
-                    "unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
+                tracing::info!(
+                    "unexpectedly on-demand downloading for task kind {:?}",
                    ctx.task_kind()
                );
                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
@@ -709,14 +780,17 @@ impl LayerInner {
    async fn spawn_download_and_wait(
        self: &Arc<Self>,
        timeline: Arc<Timeline>,
-    ) -> Result<(), DownloadError> {
+        permit: heavier_once_cell::InitPermit,
+    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
        let task_name = format!("download layer {}", self);

        let (tx, rx) = tokio::sync::oneshot::channel();
+
        // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
        // block tenant::mgr::remove_tenant_from_memory.

        let this: Arc<Self> = self.clone();
+
        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
@@ -725,6 +799,7 @@ impl LayerInner {
            &task_name,
            false,
            async move {
+
                let client = timeline
                    .remote_client
                    .as_ref()
@@ -746,9 +821,9 @@ impl LayerInner {
                    }
                };

-                if let Err(res) = tx.send(result) {
+                if let Err(res) = tx.send((result, permit)) {
                    match res {
-                        Ok(()) => {
+                        (Ok(()), _) => {
                            // our caller is cancellation safe so this is fine; if someone
                            // else requests the layer, they'll find it already downloaded
                            // or redownload.
@@ -759,7 +834,7 @@ impl LayerInner {
                            tracing::info!("layer file download completed after requester had cancelled");
                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
                        },
-                        Err(e) => {
+                        (Err(e), _) => {
                            // our caller is cancellation safe, but we might be racing with
                            // another attempt to initialize. before we have cancellation
                            // token support: these attempts should converge regardless of
@@ -775,7 +850,7 @@ impl LayerInner {
            .in_current_span(),
        );
        match rx.await {
-            Ok(Ok(())) => {
+            Ok((Ok(()), permit)) => {
                if let Some(reason) = self
                    .needs_download()
                    .await
@@ -786,10 +861,12 @@ impl LayerInner {
                }

                self.consecutive_failures.store(0, Ordering::Relaxed);
+                tracing::info!("on-demand download successful");

-                Ok(())
+                Ok(permit)
            }
-            Ok(Err(e)) => {
+            Ok((Err(e), _permit)) => {
+                // FIXME: this should be with the spawned task and be cancellation sensitive
                let consecutive_failures =
                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -807,33 +884,6 @@ impl LayerInner {
        }
    }

-    /// Access the current state without waiting for the file to be downloaded.
-    ///
-    /// Requires that we've initialized to state which is respective to the
-    /// actual residency state.
-    fn get(&self) -> Option<Arc<DownloadedLayer>> {
-        let locked = self.inner.get();
-        Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
-    }
-
-    fn get_or_apply_evictedness(
-        guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
-        wanted_evicted: &AtomicBool,
-    ) -> Option<Arc<DownloadedLayer>> {
-        if let Some(mut x) = guard {
-            if let Some(won) = x.get() {
-                // there are no guarantees that we will always get to observe a concurrent call
-                // to evict
-                if wanted_evicted.load(Ordering::Acquire) {
-                    x.downgrade();
-                }
-                return Some(won);
-            }
-        }
-
-        None
-    }
-
    async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
        match tokio::fs::metadata(&self.path).await {
            Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -853,7 +903,7 @@ impl LayerInner {
    fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
        // in future, this should include sha2-256 validation of the file.
        if !m.is_file() {
-            Err(NeedsDownload::NotFile)
+            Err(NeedsDownload::NotFile(m.file_type()))
        } else if m.len() != self.desc.file_size {
            Err(NeedsDownload::WrongSize {
                actual: m.len(),
@@ -867,7 +917,9 @@ impl LayerInner {
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

-        let remote = self.get().is_none();
+        // this is not accurate: we could have the file locally but there was a cancellation
+        // and now we are not in sync, or we are currently downloading it.
+        let remote = self.inner.get().is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -896,7 +948,7 @@ impl LayerInner {
    }

    /// `DownloadedLayer` is being dropped, so it calls this method.
-    fn on_downloaded_layer_drop(self: Arc<LayerInner>) {
+    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
        let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;
@@ -904,15 +956,16 @@ impl LayerInner {
        if gc {
            // do nothing now, only in LayerInner::drop
        } else if can_evict && evict {
-            let version = self.version.load(Ordering::Relaxed);
-
-            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self);
+            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);

            // downgrade for queueing, in case there's a tear down already ongoing we should not
            // hold it alive.
            let this = Arc::downgrade(&self);
            drop(self);

+            // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
+            // drop while the `self.inner` is being locked, leading to a deadlock.
+
            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
                let _g = span.entered();

@@ -922,19 +975,15 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                    return;
                };
-                this.evict_blocking(version);
+                match this.evict_blocking(version) {
+                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
+                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
+                }
            });
        }
    }

-    fn evict_blocking(&self, version: usize) {
-        match self.evict_blocking0(version) {
-            Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
-            Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
-        }
-    }
-
-    fn evict_blocking0(&self, version: usize) -> Result<(), EvictionCancelled> {
+    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
@@ -945,32 +994,34 @@ impl LayerInner {
        let _permit = {
            let maybe_downloaded = self.inner.get();

-            if version != self.version.load(Ordering::Relaxed) {
-                // downloadness-state has advanced, we might no longer be the latest eviction
-                // work; don't do anything.
-                //
-                // this is possible to get to by having:
-                //
-                // 1. wanted_evicted.store(true)
-                // 2. ResidentOrWantedEvicted::downgrade
-                // 3. DownloadedLayer::drop
-                // 4. LayerInner::get_or_maybe_download
-                // 5. LayerInner::evict_blocking
-                return Err(EvictionCancelled::VersionCheckFailed);
-            }
-
-            // free the DownloadedLayer allocation
-            match maybe_downloaded.map(|mut g| g.take_and_deinit()) {
-                Some((taken, permit)) => {
-                    assert!(matches!(taken, ResidentOrWantedEvicted::WantedEvicted(_)));
-                    permit
+            let (_weak, permit) = match maybe_downloaded {
+                Some(mut guard) => {
+                    if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
+                        if *version == only_version {
+                            guard.take_and_deinit()
+                        } else {
+                            // this was not for us; maybe there's another eviction job
+                            // TODO: does it make any sense to stall here? unique versions do not
+                            // matter, we only want to make sure not to evict a resident, which we
+                            // are not doing.
+                            return Err(EvictionCancelled::VersionCheckFailed);
+                        }
+                    } else {
+                        return Err(EvictionCancelled::AlreadyReinitialized);
+                    }
                }
                None => {
-                    unreachable!("we do the version checking for this exact reason")
+                    // already deinitialized, perhaps get_or_maybe_download did this and is
+                    // currently waiting to reinitialize it
+                    return Err(EvictionCancelled::LostToDownload);
                }
-            }
+            };
+
+            permit
        };

+        // now accesses to inner.get_or_init wait on the semaphore or the `_permit`
+
        self.access_stats.record_residence_event(
            LayerResidenceStatus::Evicted,
            LayerResidenceEventReason::ResidenceChange,
@@ -1003,11 +1054,14 @@ impl LayerInner {
                Ok(())
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                tracing::info!("failed to evict file from disk, it was already gone");
+                tracing::error!(
+                    layer_size = %self.desc.file_size,
+                    "failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
+                );
                Err(EvictionCancelled::FileNotFound)
            }
            Err(e) => {
-                tracing::warn!("failed to evict file from disk: {e:#}");
+                tracing::error!("failed to evict file from disk: {e:#}");
                Err(EvictionCancelled::RemoveFailed)
            }
        };
@@ -1051,6 +1105,8 @@ enum DownloadError {
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
    DownloadRequired,
+    #[error("layer path exists, but it is not a file: {0:?}")]
+    NotFile(std::fs::FileType),
    /// Why no error here? Because it will be reported by page_service. We should had also done
    /// retries already.
    #[error("downloading evicted layer file failed")]
@@ -1066,7 +1122,7 @@ enum DownloadError {
 #[derive(Debug, PartialEq)]
 pub(crate) enum NeedsDownload {
    NotFound,
-    NotFile,
+    NotFile(std::fs::FileType),
    WrongSize { actual: u64, expected: u64 },
 }

@@ -1074,7 +1130,7 @@ impl std::fmt::Display for NeedsDownload {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            NeedsDownload::NotFound => write!(f, "file was not found"),
-            NeedsDownload::NotFile => write!(f, "path is not a file"),
+            NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
            NeedsDownload::WrongSize { actual, expected } => {
                write!(f, "file size mismatch {actual} vs. {expected}")
            }
@@ -1085,7 +1141,10 @@ impl std::fmt::Display for NeedsDownload {
 /// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
 pub(crate) struct DownloadedLayer {
    owner: Weak<LayerInner>,
+    // Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
+    // DownloadedLayer
    kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
+    version: usize,
 }

 impl std::fmt::Debug for DownloadedLayer {
@@ -1093,6 +1152,7 @@ impl std::fmt::Debug for DownloadedLayer {
        f.debug_struct("DownloadedLayer")
            // owner omitted because it is always "Weak"
            .field("kind", &self.kind)
+            .field("version", &self.version)
            .finish()
    }
 }
@@ -1100,7 +1160,7 @@ impl std::fmt::Debug for DownloadedLayer {
 impl Drop for DownloadedLayer {
    fn drop(&mut self) {
        if let Some(owner) = self.owner.upgrade() {
-            owner.on_downloaded_layer_drop();
+            owner.on_downloaded_layer_drop(self.version);
        } else {
            // no need to do anything, we are shutting down
        }
@@ -1126,7 +1186,6 @@ impl DownloadedLayer {
                "these are the same, just avoiding the upgrade"
            );

-            // there is nothing async here, but it should be async
            let res = if owner.desc.is_delta {
                let summary = Some(delta_layer::Summary::expected(
                    owner.desc.tenant_id,
@@ -1225,6 +1284,8 @@ impl std::fmt::Debug for ResidentLayer {

 impl ResidentLayer {
    /// Release the eviction guard, converting back into a plain [`Layer`].
+    ///
+    /// You can access the [`Layer`] also by using `as_ref`.
    pub(crate) fn drop_eviction_guard(self) -> Layer {
        self.into()
    }
@@ -1280,7 +1341,7 @@ impl AsRef<Layer> for ResidentLayer {
    }
 }

-/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
+/// Drop the eviction guard.
 impl From<ResidentLayer> for Layer {
    fn from(value: ResidentLayer) -> Self {
        value.owner
@@ -1450,6 +1511,13 @@ impl LayerImplMetrics {
            .unwrap()
            .inc();
    }
+
+    fn inc_broadcast_lagged(&self) {
+        self.rare_counters
+            .get_metric_with_label_values(&["broadcast_lagged"])
+            .unwrap()
+            .inc();
+    }
 }

 enum EvictionCancelled {
@@ -1458,6 +1526,11 @@ enum EvictionCancelled {
    VersionCheckFailed,
    FileNotFound,
    RemoveFailed,
+    AlreadyReinitialized,
+    /// Not evicted because of a pending reinitialization
+    LostToDownload,
+    /// After eviction, there was a new layer access which cancelled the eviction.
+    UpgradedBackOnAccess,
 }

 impl EvictionCancelled {
@@ -1468,6 +1541,9 @@ impl EvictionCancelled {
            EvictionCancelled::VersionCheckFailed => "version_check_fail",
            EvictionCancelled::FileNotFound => "file_not_found",
            EvictionCancelled::RemoveFailed => "remove_failed",
+            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
+            EvictionCancelled::LostToDownload => "lost_to_download",
+            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
        }
    }
 }
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -410,7 +410,7 @@ impl DeleteTimelineFlow {
    }

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
-    /// See corresponding parts of [`crate::tenant::timeline::delete::DeleteTimelineFlow`]
+    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -344,7 +344,20 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        //
+        // It is critical we are responsive to cancellation here. Otherwise, we deadlock with
+        // tenant deletion (holds TENANTS in read mode) any other task that attempts to
+        // acquire TENANTS in write mode before we here call get_tenant.
+        // See https://github.com/neondatabase/neon/issues/5284.
+        let res = tokio::select! {
+            _ = cancel.cancelled() => {
+                return ControlFlow::Break(());
+            }
+            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
+                res
+            }
+        };
+        let tenant = match res {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,6 +19,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{RwLock, RwLockWriteGuard};
+use utils::fs_ext;

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -173,37 +174,78 @@ impl OpenFiles {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub enum CrashsafeOverwriteError {
-    #[error("final path has no parent dir")]
-    FinalPathHasNoParentDir,
-    #[error("remove tempfile")]
-    RemovePreviousTempfile(#[source] std::io::Error),
-    #[error("create tempfile")]
-    CreateTempfile(#[source] std::io::Error),
-    #[error("write tempfile")]
-    WriteContents(#[source] std::io::Error),
-    #[error("sync tempfile")]
-    SyncTempfile(#[source] std::io::Error),
-    #[error("rename tempfile to final path")]
-    RenameTempfileToFinalPath(#[source] std::io::Error),
-    #[error("open final path parent dir")]
-    OpenFinalPathParentDir(#[source] std::io::Error),
-    #[error("sync final path parent dir")]
-    SyncFinalPathParentDir(#[source] std::io::Error),
+/// Identify error types that should alwways terminate the process.  Other
+/// error types may be elegible for retry.
+pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
+    use nix::errno::Errno::*;
+    match e.raw_os_error().map(nix::errno::from_i32) {
+        Some(EIO) => {
+            // Terminate on EIO because we no longer trust the device to store
+            // data safely, or to uphold persistence guarantees on fsync.
+            true
+        }
+        Some(EROFS) => {
+            // Terminate on EROFS because a filesystem is usually remounted
+            // readonly when it has experienced some critical issue, so the same
+            // logic as EIO applies.
+            true
+        }
+        Some(EACCES) => {
+            // Terminate on EACCESS because we should always have permissions
+            // for our own data dir: if we don't, then we can't do our job and
+            // need administrative intervention to fix permissions.  Terminating
+            // is the best way to make sure we stop cleanly rather than going
+            // into infinite retry loops, and will make it clear to the outside
+            // world that we need help.
+            true
+        }
+        _ => {
+            // Treat all other local file I/O errors are retryable.  This includes:
+            // - ENOSPC: we stay up and wait for eviction to free some space
+            // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
+            // - WriteZero, Interrupted: these are used internally VirtualFile
+            false
+        }
+    }
 }
-impl CrashsafeOverwriteError {
-    /// Returns true iff the new contents are durably stored.
-    pub fn are_new_contents_durable(&self) -> bool {
+
+/// Call this when the local filesystem gives us an error with an external
+/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either
+/// bad storage or bad configuration, and we can't fix that from inside
+/// a running process.
+pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
+    tracing::error!("Fatal I/O error: {e}: {context})");
+    std::process::abort();
+}
+
+pub(crate) trait MaybeFatalIo<T> {
+    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
+    fn fatal_err(self, context: &str) -> T;
+}
+
+impl<T> MaybeFatalIo<T> for std::io::Result<T> {
+    /// Terminate the process if the result is an error of a fatal type, else pass it through
+    ///
+    /// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
+    /// not on ENOSPC.
+    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
+        if let Err(e) = &self {
+            if is_fatal_io_error(e) {
+                on_fatal_io_error(e, context);
+            }
+        }
+        self
+    }
+
+    /// Terminate the process on any I/O error.
+    ///
+    /// This is appropriate for reads on files that we know exist: they should always work.
+    fn fatal_err(self, context: &str) -> T {
        match self {
-            Self::FinalPathHasNoParentDir => false,
-            Self::RemovePreviousTempfile(_) => false,
-            Self::CreateTempfile(_) => false,
-            Self::WriteContents(_) => false,
-            Self::SyncTempfile(_) => false,
-            Self::RenameTempfileToFinalPath(_) => false,
-            Self::OpenFinalPathParentDir(_) => false,
-            Self::SyncFinalPathParentDir(_) => true,
+            Ok(v) => v,
+            Err(e) => {
+                on_fatal_io_error(&e, context);
+            }
        }
    }
 }
@@ -284,15 +326,13 @@ impl VirtualFile {
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
        content: &[u8],
-    ) -> Result<(), CrashsafeOverwriteError> {
+    ) -> std::io::Result<()> {
        let Some(final_path_parent) = final_path.parent() else {
-            return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
+            return Err(std::io::Error::from_raw_os_error(
+                nix::errno::Errno::EINVAL as i32,
+            ));
        };
-        match std::fs::remove_file(tmp_path) {
-            Ok(()) => {}
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
-        }
+        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
        let mut file = Self::open_with_options(
            tmp_path,
            OpenOptions::new()
@@ -301,31 +341,20 @@ impl VirtualFile {
                // we bail out instead of causing damage.
                .create_new(true),
        )
-        .await
-        .map_err(CrashsafeOverwriteError::CreateTempfile)?;
-        file.write_all(content)
-            .await
-            .map_err(CrashsafeOverwriteError::WriteContents)?;
-        file.sync_all()
-            .await
-            .map_err(CrashsafeOverwriteError::SyncTempfile)?;
+        .await?;
+        file.write_all(content).await?;
+        file.sync_all().await?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)
-            .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
+        std::fs::rename(tmp_path, final_path)?;
        // Only open final path parent dirfd now, so that this operation only
        // ever holds one VirtualFile fd at a time.  That's important because
        // the current `find_victim_slot` impl might pick the same slot for both
        // VirtualFile., and it eventually does a blocking write lock instead of
        // try_lock.
        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
-                .await
-                .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
-        final_parent_dirfd
-            .sync_all()
-            .await
-            .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
+            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
+        final_parent_dirfd.sync_all().await?;
        Ok(())
    }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -857,7 +857,8 @@ impl WalRedoProcess {
            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            } else if in_revents.contains(PollFlags::POLLHUP) {
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
                // We still have more data to write, but the process closed the pipe.
                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
            }
@@ -907,7 +908,8 @@ impl WalRedoProcess {
                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                } else if out_revents.contains(PollFlags::POLLHUP) {
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
                }
            }
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -13,6 +13,7 @@ pub struct ConsoleError {
 #[derive(Deserialize)]
 pub struct GetRoleSecret {
    pub role_secret: Box<str>,
+    pub allowed_ips: Option<Vec<Box<str>>>,
 }

 // Manually implement debug to omit sensitive info.
@@ -187,4 +188,31 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn parse_wake_compute() -> anyhow::Result<()> {
+        let json = json!({
+            "address": "0.0.0.0",
+            "aux": dummy_aux(),
+        });
+        let _: WakeCompute = serde_json::from_str(&json.to_string())?;
+        Ok(())
+    }
+
+    #[test]
+    fn parse_get_role_secret() -> anyhow::Result<()> {
+        // Empty `allowed_ips` field.
+        let json = json!({
+            "role_secret": "secret",
+        });
+        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+        // Empty `allowed_ips` field.
+        let json = json!({
+            "role_secret": "secret",
+            "allowed_ips": ["8.8.8.8"],
+        });
+        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
+
+        Ok(())
+    }
 }
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -49,7 +49,7 @@ impl Api {
                .endpoint
                .get("proxy_get_role_secret")
                .header("X-Request-ID", &request_id)
-                .header("Authorization", &self.jwt)
+                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
@@ -94,7 +94,7 @@ impl Api {
                .endpoint
                .get("proxy_wake_compute")
                .header("X-Request-ID", &request_id)
-                .header("Authorization", &self.jwt)
+                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -470,30 +470,26 @@ async fn query_to_json<T: GenericClient>(
    }
    .and_then(|s| s.parse::<i64>().ok());

-    let fields = if !rows.is_empty() {
-        rows[0]
-            .columns()
-            .iter()
-            .map(|c| {
-                json!({
-                    "name": Value::String(c.name().to_owned()),
-                    "dataTypeID": Value::Number(c.type_().oid().into()),
-                    "tableID": c.table_oid(),
-                    "columnID": c.column_id(),
-                    "dataTypeSize": c.type_size(),
-                    "dataTypeModifier": c.type_modifier(),
-                    "format": "text",
-                })
-            })
-            .collect::<Vec<_>>()
-    } else {
-        Vec::new()
-    };
+    let mut fields = vec![];
+    let mut columns = vec![];
+
+    for c in row_stream.columns() {
+        fields.push(json!({
+            "name": Value::String(c.name().to_owned()),
+            "dataTypeID": Value::Number(c.type_().oid().into()),
+            "tableID": c.table_oid(),
+            "columnID": c.column_id(),
+            "dataTypeSize": c.type_size(),
+            "dataTypeModifier": c.type_modifier(),
+            "format": "text",
+        }));
+        columns.push(client.get_type(c.type_oid()).await?);
+    }

    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
+        .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;

    // resulting JSON format is based on the format of node-postgres result
@@ -514,22 +510,28 @@ async fn query_to_json<T: GenericClient>(
 //
 pub fn pg_text_row_to_json(
    row: &Row,
+    columns: &[Type],
    raw_output: bool,
    array_mode: bool,
 ) -> Result<Value, anyhow::Error> {
-    let iter = row.columns().iter().enumerate().map(|(i, column)| {
-        let name = column.name();
-        let pg_value = row.as_text(i)?;
-        let json_value = if raw_output {
-            match pg_value {
-                Some(v) => Value::String(v.to_string()),
-                None => Value::Null,
-            }
-        } else {
-            pg_text_to_json(pg_value, column.type_())?
-        };
-        Ok((name.to_string(), json_value))
-    });
+    let iter = row
+        .columns()
+        .iter()
+        .zip(columns)
+        .enumerate()
+        .map(|(i, (column, typ))| {
+            let name = column.name();
+            let pg_value = row.as_text(i)?;
+            let json_value = if raw_output {
+                match pg_value {
+                    Some(v) => Value::String(v.to_string()),
+                    None => Value::Null,
+                }
+            } else {
+                pg_text_to_json(pg_value, typ)?
+            };
+            Ok((name.to_string(), json_value))
+        });

    if array_mode {
        // drop keys and aggregate into array
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -81,7 +81,6 @@ FALLBACK_DURATION = {
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
-    "test_runner/performance/test_startup.py::test_startup": 890.114,
    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in the wild by tests with the below contradicting logging
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
            # this seems like a mock_s3 issue
-            log.warn(
+            log.warning(
                f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
            )
            keys = 0
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in one case with mock_s3:
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
            # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-            log.warn(
+            log.warning(
                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
            )

--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,8 +1,10 @@
 from contextlib import closing

+from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion
+from fixtures.types import Lsn


 #
@@ -18,6 +20,8 @@ from fixtures.pg_version import PgVersion
 def test_bulk_insert(neon_with_baseline: PgCompare):
    env = neon_with_baseline

+    start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
+
    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table huge (i int, j int);")
@@ -31,6 +35,13 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
            env.report_peak_memory_use()
            env.report_size()

+    # Report amount of wal written. Useful for comparing vanilla wal format vs
+    # neon wal format, measuring neon write amplification, etc.
+    end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
+    wal_written_bytes = end_lsn - start_lsn
+    wal_written_mb = round(wal_written_bytes / (1024 * 1024))
+    env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
+
    # When testing neon, also check how long it takes the pageserver to reingest the
    # wal from safekeepers. If this number is close to total runtime, then the pageserver
    # is the bottleneck.
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -1,6 +1,3 @@
-from contextlib import closing
-
-import pytest
 import requests
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
@@ -81,49 +78,3 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)
-
-
-# This test sometimes runs for longer than the global 5 minute timeout.
-@pytest.mark.timeout(900)
-def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
-    neon_env_builder.num_safekeepers = 3
-    env = neon_env_builder.init_start()
-
-    # Start
-    env.neon_cli.create_branch("test_startup")
-    with zenbenchmark.record_duration("startup_time"):
-        endpoint = env.endpoints.create_start("test_startup")
-        endpoint.safe_psql("select 1;")
-
-    # Restart
-    endpoint.stop_and_destroy()
-    with zenbenchmark.record_duration("restart_time"):
-        endpoint.create_start("test_startup")
-        endpoint.safe_psql("select 1;")
-
-    # Fill up
-    num_rows = 1000000  # 30 MB
-    num_tables = 100
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            for i in range(num_tables):
-                cur.execute(f"create table t_{i} (i integer);")
-                cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
-
-    # Read
-    with zenbenchmark.record_duration("read_time"):
-        endpoint.safe_psql("select * from t_0;")
-
-    # Read again
-    with zenbenchmark.record_duration("second_read_time"):
-        endpoint.safe_psql("select * from t_0;")
-
-    # Restart
-    endpoint.stop_and_destroy()
-    with zenbenchmark.record_duration("restart_with_data"):
-        endpoint.create_start("test_startup")
-        endpoint.safe_psql("select 1;")
-
-    # Read
-    with zenbenchmark.record_duration("read_after_restart"):
-        endpoint.safe_psql("select * from t_0;")
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -432,3 +432,47 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
    query(200, "BEGIN")
    pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
    assert pid1 != pid2
+
+
+@pytest.mark.timeout(60)
+def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
+
+    def query(status: int, query: str) -> Any:
+        return static_proxy.http_query(
+            query,
+            [],
+            user="http_auth",
+            password="http",
+            expected_code=status,
+        )
+
+    # query generates a million rows - should hit the 10MB reponse limit quickly
+    response = query(
+        400,
+        "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
+    )
+    assert "response is too large (max is 10485760 bytes)" in response["message"]
+
+
+def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
+
+    def query(status: int, query: str) -> Any:
+        return static_proxy.http_query(
+            query,
+            [],
+            user="http_auth",
+            password="http",
+            expected_code=status,
+        )
+
+    response = query(
+        200,
+        "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
+    )
+    assert response["rows"][0]["data"] == ["foo", "bar", "baz"]
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -116,6 +116,8 @@ FAILPOINTS = [
    "tenant-delete-before-create-local-mark",
    "tenant-delete-before-background",
    "tenant-delete-before-polling-ongoing-deletions",
+    "tenant-delete-before-cleanup-remaining-fs-traces",
+    "tenant-delete-before-remove-timelines-dir",
    "tenant-delete-before-remove-deleted-mark",
    "tenant-delete-before-remove-tenant-dir",
    # Some failpoints from timeline deletion
@@ -127,6 +129,7 @@ FAILPOINTS = [

 FAILPOINTS_BEFORE_BACKGROUND = [
    "timeline-delete-before-schedule",
+    "tenant-delete-before-shutdown",
    "tenant-delete-before-create-remote-mark",
    "tenant-delete-before-create-local-mark",
    "tenant-delete-before-background",
@@ -240,7 +243,10 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
    if check is Check.RETRY_WITH_RESTART:
        env.pageserver.restart()

-        if failpoint in ("tenant-delete-before-create-remote-mark",):
+        if failpoint in (
+            "tenant-delete-before-shutdown",
+            "tenant-delete-before-create-remote-mark",
+        ):
            wait_until_tenant_active(
                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
            )