From c178f34e21d7822d1545026a68f68ccb7c3f2e82 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 27 Oct 2023 15:53:39 +0100
Subject: [PATCH] pageserver: start simplifying tenant deletion

---
 pageserver/src/http/routes.rs             |   3 +-
 pageserver/src/task_mgr.rs                |   3 +
 pageserver/src/tenant.rs                  | 103 ++--
 pageserver/src/tenant/delete.rs           | 549 +++++++---------------
 pageserver/src/tenant/mgr.rs              | 137 ++++--
 pageserver/src/tenant/timeline/delete.rs  |   2 +-
 test_runner/regress/test_tenant_delete.py |   8 +-
 7 files changed, 311 insertions(+), 494 deletions(-)
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 988b148a1f..daa724decc 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -275,7 +275,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
         use crate::tenant::delete::DeleteTenantError::*;
         match value {
             Get(g) => ApiError::from(g),
-            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
             Timeline(t) => ApiError::from(t),
             NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
             SlotError(e) => e.into(),
@@ -782,7 +781,7 @@ async fn tenant_delete_handler(
 
     let state = get_state(&request);
 
-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id, false)
         .instrument(info_span!("tenant_delete_handler", %tenant_id))
         .await?;
 
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 017322ffb2..2216af7544 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -291,6 +291,9 @@ pub enum TaskKind {
     // A request that comes in via the pageserver HTTP API.
     MgmtRequest,
 
+    // Tenant Manager servicing a channel that enables upcalls from Tenant
+    TenantManagerUpcall,
+
     DebugTool,
 
     #[cfg(test)]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 8f3d979e71..99b64ea376 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -51,10 +51,9 @@ use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
-use self::delete::DeleteTenantFlow;
+use self::delete::should_resume_deletion;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
-use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
@@ -250,8 +249,6 @@ pub struct Tenant {
     cached_synthetic_tenant_size: Arc<AtomicU64>,
 
     eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
-
-    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }
 
 impl std::fmt::Debug for Tenant {
@@ -537,8 +534,8 @@ impl Tenant {
         resources: TenantSharedResources,
         attached_conf: AttachedTenantConf,
         init_order: Option<InitializationOrder>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
         mode: SpawnMode,
+        resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
@@ -625,67 +622,48 @@ impl Tenant {
                 // Remote preload is complete.
                 drop(remote_load_completion);
 
-                let pending_deletion = {
-                    match DeleteTenantFlow::should_resume_deletion(
-                        conf,
-                        preload.as_ref().map(|p| p.deleting).unwrap_or(false),
-                        &tenant_clone,
-                    )
-                    .await
-                    {
-                        Ok(should_resume_deletion) => should_resume_deletion,
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
+                let resume_deletion = should_resume_deletion(conf, preload.as_ref().map(|p| p.deleting).unwrap_or(false), &tenant_id).await?;
+                if resume_deletion {
+                    // We will wait until the background 
+                    info!("Tenant is partially deleted, will resume");
+
+                    // Put the Tenant into a Stopping state so that it will not try to serve any I/O
+                    tenant_clone
+                        .set_stopping(false, true)
+                        .await
+                        .expect("cant be stopping or broken");
+
+                    // Proceed with attaching the tenant to load the metadata we will use for remote deletion
+                    match tenant_clone.attach(init_order, preload, &ctx).await {
+                        Ok(()) => {
+                            info!("attach finished, deletion will resume");
+                        }
+                        Err(e) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(e));
+                        }
+                    }
+
+                    if let Some(resume_deletion_upcall) = resume_deletion_upcall {
+                        // If this send() fails it means we're shutting down, so just drop it on the floor
+                        resume_deletion_upcall.send(tenant_clone).await.ok();
+                    } else {
+                        make_broken(&tenant_clone, anyhow::anyhow!(
+                            "Attemped to attach a partially deleted tenant outside of pageserver startup"
+                        ));
+                    }
+                } else {
+                    // Normal case: load all the metadata for the tenant.
+                    match tenant_clone.attach(init_order, preload, &ctx).await {
+                        Ok(()) => {
+                            info!("attach finished, activating");
+                            tenant_clone.activate(broker_client, None, &ctx);
+                        }
+                        Err(e) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(e));
                         }
                     }
                 };
 
-                info!("pending_deletion {}", pending_deletion.is_some());
-
-                if let Some(deletion) = pending_deletion {
-                    // as we are no longer loading, signal completion by dropping
-                    // the completion while we resume deletion
-                    drop(_completion);
-                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
-                    let _ = init_order
-                        .as_mut()
-                        .and_then(|x| x.initial_logical_size_attempt.take());
-                    let background_jobs_can_start =
-                        init_order.as_ref().map(|x| &x.background_jobs_can_start);
-                    if let Some(background) = background_jobs_can_start {
-                        info!("waiting for backgound jobs barrier");
-                        background.clone().wait().await;
-                        info!("ready for backgound jobs barrier");
-                    }
-
-                    match DeleteTenantFlow::resume_from_attach(
-                        deletion,
-                        &tenant_clone,
-                        preload,
-                        tenants,
-                        init_order,
-                        &ctx,
-                    )
-                    .await
-                    {
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                        Ok(()) => return Ok(()),
-                    }
-                }
-
-                match tenant_clone.attach(init_order, preload, &ctx).await {
-                    Ok(()) => {
-                        info!("attach finished, activating");
-                        tenant_clone.activate(broker_client, None, &ctx);
-                    }
-                    Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e));
-                    }
-                }
                 Ok(())
             }
             .instrument({
@@ -2362,7 +2340,6 @@ impl Tenant {
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
             eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
-            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
         }
     }
 
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index fa63c83c17..a9ca33c26f 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,31 +1,27 @@
 use std::sync::Arc;
 
 use anyhow::Context;
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8Path;
 use pageserver_api::models::TenantState;
 use remote_storage::{GenericRemoteStorage, RemotePath};
-use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, warn, Instrument, Span};
+use tracing::error;
 
 use utils::{
-    backoff, crashsafe, fs_ext,
+    backoff, crashsafe,
     id::{TenantId, TimelineId},
 };
 
 use crate::{
     config::PageServerConf,
-    context::RequestContext,
-    task_mgr::{self, TaskKind},
-    InitializationOrder,
+    tenant::{mgr::safe_rename_tenant_dir, ShutdownError},
 };
 
 use super::{
-    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
+    mgr::{GetTenantError, SlotGuard, TenantSlotError, TenantSlotUpsertError},
     remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
-    span,
     timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
+    tree_sort_timelines, DeleteTimelineError, Tenant,
 };
 
 #[derive(Debug, thiserror::Error)]
@@ -39,9 +35,6 @@ pub(crate) enum DeleteTenantError {
     #[error("Invalid state {0}. Expected Active or Broken")]
     InvalidState(TenantState),
 
-    #[error("Tenant deletion is already in progress")]
-    AlreadyInProgress,
-
     #[error("Tenant map slot error {0}")]
     SlotError(#[from] TenantSlotError),
 
@@ -55,7 +48,142 @@ pub(crate) enum DeleteTenantError {
     Other(#[from] anyhow::Error),
 }
 
-type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
+/// The part of tenant deletion that must happen before acknowledging the deletion request
+///
+/// Usually deletion requires that the tenant is not already in Stopping state: set
+/// `resume` to true to skip this check if resuming deletion at startup, since we
+/// would have loaded the tenant into Stopping mode already
+pub(crate) async fn delete_tenant_foreground(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant: &Arc<Tenant>,
+    resume: bool,
+    // Witness that we are doing this work within a TenantSlot::InProgress state.
+    _slot_guard: &SlotGuard,
+) -> Result<(), DeleteTenantError> {
+    // In resume mode, we must be stopping.  Else, we must _not_ be Stopping.
+    if !(matches!(tenant.current_state(), TenantState::Stopping) ^ !resume) {
+        return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+    }
+
+    fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-create-remote-mark"
+        ))?
+    });
+
+    // Write persistent tombstone
+    if let Some(remote_storage) = &remote_storage {
+        create_remote_delete_mark(conf, remote_storage, &tenant.get_tenant_id()).await?;
+    }
+
+    fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-create-local-mark"
+        ))?
+    });
+
+    create_local_delete_mark(conf, &tenant.tenant_id)
+        .await
+        .context("local create mark")?;
+
+    fail::fail_point!("tenant-delete-before-background", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-background"
+        ))?
+    });
+
+    Ok(())
+}
+
+/// The part of tenant deletion that happens after we have already acknowledged the request.
+///
+/// This part may retried after pageserver restart if we see a tenant that has a deletion marker
+/// but has not been completely deleted.
+pub(crate) async fn delete_tenant_background(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant_id: TenantId,
+    tenant: &Arc<Tenant>,
+    resume: bool,
+    // Witness that we are doing this work within a TenantSlot::InProgress state.
+    _slot_guard: &SlotGuard,
+) -> anyhow::Result<()> {
+    let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
+        .await
+        .context("schedule_ordered_timeline_deletions")?;
+
+    fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-polling-ongoing-deletions"
+        ))?
+    });
+
+    // Wait for deletions that were already running at the moment when tenant deletion was requested.
+    // When we can lock deletion guard it means that corresponding timeline deletion finished.
+    for (guard, timeline_id) in already_running_timeline_deletions {
+        let flow = guard.lock().await;
+        if !flow.is_finished() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "already running timeline deletion failed: {timeline_id}"
+            ))
+            .into());
+        }
+    }
+
+    // For convenience, use the Tenant's deletion queue reference so that we don't have
+    // to take it as an  argument.
+    let deletion_queue_client = tenant.deletion_queue_client.clone();
+
+    fail::fail_point!("tenant-delete-before-shutdown", |_| {
+        Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+    });
+
+    // Tear down local runtime state
+    if !resume {
+        tenant.shutdown(false).await.map_err(|e| match e {
+            ShutdownError::AlreadyStopping => {
+                DeleteTenantError::InvalidState(TenantState::Stopping)
+            }
+        })?;
+    }
+
+    // Not necessary for correctness, but executing deletions before erasing local contents
+    // means that we will have a better chance to resume the delete if we crash.
+    deletion_queue_client.flush_execute().await?;
+
+    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-deleted-mark"
+        ))?
+    });
+
+    // Remove the deletion marker
+    remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant_id).await?;
+
+    // Not necessary for correctness, but helps make it true that when we log that deletion is
+    // done, it really is.
+    deletion_queue_client.flush_execute().await?;
+
+    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-tenant-dir"
+        ))?
+    });
+
+    // Remove local storage contents.  We do this last, so that if we crash during delete, on
+    // restart we will attempt to re-attach the tenant and resume the deletion.
+    let local_tenant_directory = conf.tenant_path(&tenant_id);
+    let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
+        .await
+        .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
+
+    tokio::fs::remove_dir_all(tmp_path.as_path())
+        .await
+        .with_context(|| format!("tenant directory {:?} deletion", tmp_path))?;
+
+    Ok(())
+}
 
 fn remote_tenant_delete_mark_path(
     conf: &PageServerConf,
@@ -70,7 +198,7 @@ fn remote_tenant_delete_mark_path(
     Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }
 
-async fn create_remote_delete_mark(
+pub(crate) async fn create_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: &GenericRemoteStorage,
     tenant_id: &TenantId,
@@ -115,7 +243,26 @@ async fn create_local_delete_mark(
     Ok(())
 }
 
-async fn schedule_ordered_timeline_deletions(
+pub(crate) async fn should_resume_deletion(
+    conf: &'static PageServerConf,
+    remote_mark_exists: bool,
+    tenant_id: &TenantId,
+) -> Result<bool, DeleteTenantError> {
+    if remote_mark_exists {
+        return Ok(true);
+    }
+
+    // In the very last stage of deletion, we might have already removed the remote
+    // marker, but be attaching the tenant anyway on the basis of its local directory
+    // existing: to resume deletion in this case we need the local deletion marker.
+    if conf.tenant_deleted_mark_file_path(tenant_id).exists() {
+        Ok(true)
+    } else {
+        Ok(false)
+    }
+}
+
+pub(crate) async fn schedule_ordered_timeline_deletions(
     tenant: &Arc<Tenant>,
 ) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
     // Tenant is stopping at this point. We know it will be deleted.
@@ -153,21 +300,7 @@ async fn schedule_ordered_timeline_deletions(
     Ok(already_running_deletions)
 }
 
-async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
-    // Assert timelines dir is empty.
-    if !fs_ext::is_directory_empty(timelines_path).await? {
-        // Display first 10 items in directory
-        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
-        let list = &list.into_iter().take(10).collect::<Vec<_>>();
-        return Err(DeleteTenantError::Other(anyhow::anyhow!(
-            "Timelines directory is not empty after all timelines deletion: {list:?}"
-        )));
-    }
-
-    Ok(())
-}
-
-async fn remove_tenant_remote_delete_mark(
+pub(crate) async fn remove_tenant_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: Option<&GenericRemoteStorage>,
     tenant_id: &TenantId,
@@ -188,357 +321,3 @@ async fn remove_tenant_remote_delete_mark(
     }
     Ok(())
 }
-
-// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
-async fn cleanup_remaining_fs_traces(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
-        if is_dir {
-            tokio::fs::remove_dir(&p).await
-        } else {
-            tokio::fs::remove_file(&p).await
-        }
-        .or_else(fs_ext::ignore_not_found)
-        .with_context(|| format!("failed to delete {p}"))
-    };
-
-    rm(conf.tenant_config_path(tenant_id), false).await?;
-    rm(conf.tenant_location_config_path(tenant_id), false).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-timelines-dir"
-        ))?
-    });
-
-    rm(conf.timelines_path(tenant_id), true).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-deleted-mark"
-        ))?
-    });
-
-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let tenant_path = &conf.tenant_path(tenant_id);
-    if tenant_path.exists() {
-        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
-            .await
-            .context("fsync_pre_mark_remove")?;
-    }
-
-    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-tenant-dir"
-        ))?
-    });
-
-    rm(conf.tenant_path(tenant_id), true).await?;
-
-    Ok(())
-}
-
-/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
-/// and deletes its data from both disk and s3.
-/// The sequence of steps:
-/// 1. Upload remote deletion mark.
-/// 2. Create local mark file.
-/// 3. Shutdown tasks
-/// 4. Run ordered timeline deletions
-/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
-/// 6. Remove remote mark
-/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
-/// It is resumable from any step in case a crash/restart occurs.
-/// There are two entrypoints to the process:
-/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
-///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
-#[derive(Default)]
-pub enum DeleteTenantFlow {
-    #[default]
-    NotStarted,
-    InProgress,
-    Finished,
-}
-
-impl DeleteTenantFlow {
-    // These steps are run in the context of management api request handler.
-    // Long running steps are continued to run in the background.
-    // NB: If this fails half-way through, and is retried, the retry will go through
-    // all the same steps again. Make sure the code here is idempotent, and don't
-    // error out if some of the shutdown tasks have already been completed!
-    // NOTE: static needed for background part.
-    // We assume that calling code sets up the span with tenant_id.
-    #[instrument(skip_all)]
-    pub(crate) async fn run(
-        conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
-    ) -> Result<(), DeleteTenantError> {
-        span::debug_assert_current_span_has_tenant_id();
-
-        let mut guard = Self::prepare(&tenant).await?;
-
-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
-            tenant.set_broken(format!("{e:#}")).await;
-            return Err(e);
-        }
-
-        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
-
-        Ok(())
-    }
-
-    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
-    // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
-    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
-    // So the solution is to set tenant state to broken.
-    async fn run_inner(
-        guard: &mut OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
-        tenant: &Tenant,
-    ) -> Result<(), DeleteTenantError> {
-        guard.mark_in_progress()?;
-
-        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-remote-mark"
-            ))?
-        });
-
-        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
-        // Though sounds scary, different mark name?
-        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
-        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
-                .await
-                .context("remote_mark")?
-        }
-
-        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-local-mark"
-            ))?
-        });
-
-        create_local_delete_mark(conf, &tenant.tenant_id)
-            .await
-            .context("local delete mark")?;
-
-        fail::fail_point!("tenant-delete-before-background", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-background"
-            ))?
-        });
-
-        Ok(())
-    }
-
-    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
-        match self {
-            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
-            Self::InProgress { .. } => { /* We're in a retry */ }
-            Self::NotStarted => { /* Fresh start */ }
-        }
-
-        *self = Self::InProgress;
-
-        Ok(())
-    }
-
-    pub(crate) async fn should_resume_deletion(
-        conf: &'static PageServerConf,
-        remote_mark_exists: bool,
-        tenant: &Tenant,
-    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
-        let acquire = |t: &Tenant| {
-            Some(
-                Arc::clone(&t.delete_progress)
-                    .try_lock_owned()
-                    .expect("we're the only owner during init"),
-            )
-        };
-
-        if remote_mark_exists {
-            return Ok(acquire(tenant));
-        }
-
-        let tenant_id = tenant.tenant_id;
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
-            Ok(acquire(tenant))
-        } else {
-            Ok(None)
-        }
-    }
-
-    pub(crate) async fn resume_from_attach(
-        guard: DeletionGuard,
-        tenant: &Arc<Tenant>,
-        preload: Option<TenantPreload>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
-        ctx: &RequestContext,
-    ) -> Result<(), DeleteTenantError> {
-        tenant
-            .set_stopping(false, true)
-            .await
-            .expect("cant be stopping or broken");
-
-        tenant
-            .attach(init_order, preload, ctx)
-            .await
-            .context("attach")?;
-
-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
-    }
-
-    async fn prepare(
-        tenant: &Arc<Tenant>,
-    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
-        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
-        // so at least for now allow deletions only for active tenants. TODO recheck
-        // Broken and Stopping is needed for retries.
-        if !matches!(
-            tenant.current_state(),
-            TenantState::Active | TenantState::Broken { .. }
-        ) {
-            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
-        }
-
-        let guard = Arc::clone(&tenant.delete_progress)
-            .try_lock_owned()
-            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
-
-        fail::fail_point!("tenant-delete-before-shutdown", |_| {
-            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
-        });
-
-        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
-        // i e it is an error to do:
-        // tenant.set_stopping
-        // tenant.shutdown
-        // Its also bad that we're holding tenants.read here.
-        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(false).await.is_err() {
-            return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                "tenant shutdown is already in progress"
-            )));
-        }
-
-        Ok(guard)
-    }
-
-    fn schedule_background(
-        guard: OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
-    ) {
-        let tenant_id = tenant.tenant_id;
-
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
-            None,
-            "tenant_delete",
-            false,
-            async move {
-                if let Err(err) =
-                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
-                {
-                    error!("Error: {err:#}");
-                    tenant.set_broken(format!("{err:#}")).await;
-                };
-                Ok(())
-            }
-            .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
-                span.follows_from(Span::current());
-                span
-            }),
-        );
-    }
-
-    async fn background(
-        mut guard: OwnedMutexGuard<Self>,
-        conf: &PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: &Arc<Tenant>,
-    ) -> Result<(), DeleteTenantError> {
-        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
-        // Note that if deletion fails we dont mark timelines as broken,
-        // the whole tenant will become broken as by `Self::schedule_background` logic
-        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
-            .await
-            .context("schedule_ordered_timeline_deletions")?;
-
-        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-polling-ongoing-deletions"
-            ))?
-        });
-
-        // Wait for deletions that were already running at the moment when tenant deletion was requested.
-        // When we can lock deletion guard it means that corresponding timeline deletion finished.
-        for (guard, timeline_id) in already_running_timeline_deletions {
-            let flow = guard.lock().await;
-            if !flow.is_finished() {
-                return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                    "already running timeline deletion failed: {timeline_id}"
-                )));
-            }
-        }
-
-        let timelines_path = conf.timelines_path(&tenant.tenant_id);
-        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
-        if timelines_path.exists() {
-            // sanity check to guard against layout changes
-            ensure_timelines_dir_empty(&timelines_path)
-                .await
-                .context("timelines dir not empty")?;
-        }
-
-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
-
-        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
-            ))?
-        });
-
-        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
-            .await
-            .context("cleanup_remaining_fs_traces")?;
-
-        let mut locked = tenants.write().unwrap();
-        if locked.remove(&tenant.tenant_id).is_none() {
-            warn!("Tenant got removed from tenants map during deletion");
-        };
-
-        *guard = Self::Finished;
-
-        Ok(())
-    }
-}
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 318f35075c..914098a1cf 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,7 +26,6 @@ use crate::control_plane_client::{
 use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
-use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{
     create_tenant_files, AttachedTenantConf, ShutdownError, SpawnMode, Tenant, TenantState,
 };
@@ -37,7 +36,7 @@ use utils::fs_ext::PathExt;
 use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 
-use super::delete::DeleteTenantError;
+use super::delete::{delete_tenant_background, delete_tenant_foreground, DeleteTenantError};
 use super::timeline::delete::DeleteTimelineFlow;
 use super::TenantSharedResources;
 
@@ -104,13 +103,6 @@ impl TenantsMap {
             }
         }
     }
-
-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
-        }
-    }
 }
 
 /// This is "safe" in that that it won't leave behind a partially deleted directory
@@ -119,12 +111,14 @@ impl TenantsMap {
 ///
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
-async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
+pub(crate) async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
     let tmp_path = safe_rename_tenant_dir(path).await?;
     fs::remove_dir_all(tmp_path).await
 }
 
-async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
+pub(crate) async fn safe_rename_tenant_dir(
+    path: impl AsRef<Utf8Path>,
+) -> std::io::Result<Utf8PathBuf> {
     let parent = path
         .as_ref()
         .parent()
@@ -370,6 +364,10 @@ pub async fn init_tenant_mgr(
     let tenant_generations =
         init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
 
+    // Tenants may send into this channel if they discover that they are in a partially
+    // deleted state.
+    let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::<Arc<Tenant>>(128);
+
     // Construct `Tenant` objects and start them running
     for (tenant_id, location_conf) in tenant_configs {
         let tenant_dir_path = conf.tenant_path(&tenant_id);
@@ -441,8 +439,8 @@ pub async fn init_tenant_mgr(
             resources.clone(),
             AttachedTenantConf::try_from(location_conf)?,
             Some(init_order.clone()),
-            &TENANTS,
             SpawnMode::Normal,
+            Some(deletion_upcall_tx.clone()),
             &ctx,
         ) {
             Ok(tenant) => {
@@ -459,9 +457,41 @@ pub async fn init_tenant_mgr(
     let mut tenants_map = TENANTS.write().unwrap();
     assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
     *tenants_map = TenantsMap::Open(tenants);
+
+    spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx);
+
     Ok(())
 }
 
+/// For some edge cases, like resuming their own deletion on attach, tenants
+/// may submit mgr-level operations into a queue.
+fn spawn_handle_upcalls_task(
+    conf: &'static PageServerConf,
+    remote_storage: &Option<GenericRemoteStorage>,
+    mut deletion_upcall_rx: tokio::sync::mpsc::Receiver<Arc<Tenant>>,
+) {
+    let remote_storage = remote_storage.clone();
+    task_mgr::spawn(
+        task_mgr::BACKGROUND_RUNTIME.handle(),
+        TaskKind::TenantManagerUpcall,
+        None,
+        None,
+        "tenant manager upcall",
+        false,
+        async move {
+            while let Some(tenant) = deletion_upcall_rx.recv().await {
+                if let Err(e) =
+                    delete_tenant(conf, remote_storage.clone(), tenant.tenant_id, true).await
+                {
+                    error!(tenant_id = %tenant.get_tenant_id(), "Failed to complete deletion of tenant: {e}");
+                }
+            }
+
+            Ok(())
+        },
+    );
+}
+
 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
 /// a broken tenant in the map if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
@@ -472,8 +502,8 @@ pub(crate) fn tenant_spawn(
     resources: TenantSharedResources,
     location_conf: AttachedTenantConf,
     init_order: Option<InitializationOrder>,
-    tenants: &'static std::sync::RwLock<TenantsMap>,
     mode: SpawnMode,
+    resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
     ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
     anyhow::ensure!(
@@ -504,8 +534,8 @@ pub(crate) fn tenant_spawn(
         resources,
         location_conf,
         init_order,
-        tenants,
         mode,
+        resume_deletion_upcall,
         ctx,
     ) {
         Ok(tenant) => tenant,
@@ -601,8 +631,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                 if let Err(e) = res {
                     match e {
                         ShutdownError::AlreadyStopping => {
-                            // TODO: to ensure this can _never_ happen, we need to get rid of
-                            // the horrible DeleteTenantFlow::should_resume_deletion
                             tracing::warn!(%tenant_id, "Tenant already stopping during shutdown");
                         }
                     }
@@ -684,8 +712,8 @@ pub(crate) async fn create_tenant(
         resources,
         AttachedTenantConf::try_from(location_conf)?,
         None,
-        &TENANTS,
         SpawnMode::Create,
+        None,
         ctx,
     )?;
     // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
@@ -849,8 +877,8 @@ pub(crate) async fn upsert_location(
                 },
                 AttachedTenantConf::try_from(new_location_config)?,
                 None,
-                &TENANTS,
                 SpawnMode::Normal,
+                None,
                 ctx,
             )?;
 
@@ -915,18 +943,11 @@ pub(crate) async fn delete_tenant(
     conf: &'static PageServerConf,
     remote_storage: Option<GenericRemoteStorage>,
     tenant_id: TenantId,
+    resume: bool,
 ) -> Result<(), DeleteTenantError> {
     // We acquire a SlotGuard during this function to protect against concurrent
-    // changes while the ::prepare phase of DeleteTenantFlow executes, but then
-    // have to return the Tenant to the map while the background deletion runs.
-    //
-    // TODO: refactor deletion to happen outside the lifetime of a Tenant.
-    // Currently, deletion requires a reference to the tenants map in order to
-    // keep the Tenant in the map until deletion is complete, and then remove
-    // it at the end.
-    //
-    // See https://github.com/neondatabase/neon/issues/5080
-
+    // operations, but also to ensure we do not permit re-creation of the same
+    // tenant ID until we are done with the deletion.
     let mut slot_guard = tenant_map_acquire_slot(&tenant_id, Some(true))?;
 
     // unwrap is safe because we used expect_exist=true when acquiring the slot
@@ -940,11 +961,51 @@ pub(crate) async fn delete_tenant(
         }
     };
 
-    let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
+    // Do the foreground part of deletion: if it fails, set the tenant broken
+    // and leave it in the tenants map
+    if let Err(e) =
+        delete_tenant_foreground(conf, remote_storage.clone(), &tenant, resume, &slot_guard).await
+    {
+        tenant.set_broken(format!("{e:#}")).await;
+        slot_guard.upsert(TenantSlot::Attached(tenant))?;
+        return Err(e);
+    }
 
-    // Replace our InProgress marker with the Tenant in attached state, after the prepare phase of deletion is done
-    slot_guard.upsert(slot)?;
-    result
+    // Do the actual deletion in the background, while holding the SlotGuard
+    task_mgr::spawn(
+        task_mgr::BACKGROUND_RUNTIME.handle(),
+        TaskKind::MgmtRequest,
+        None, // This task runs without the tenant_id, as it should not be caught by Tenant::shutdown
+        None,
+        "delete_tenant_background",
+        false,
+        async move {
+            match delete_tenant_background(
+                conf,
+                remote_storage,
+                tenant_id,
+                &tenant,
+                resume,
+                &slot_guard,
+            )
+            .instrument(info_span!("delete_tenant_background", %tenant_id))
+            .await
+            {
+                Ok(()) => {
+                    info!(%tenant_id, "Tenant deletion complete");
+                }
+                Err(e) => {
+                    warn!(% tenant_id, "Tenant deletion failed: {e:#}");
+                    tenant.set_broken(format!("{e:#}")).await;
+                    slot_guard.upsert(TenantSlot::Attached(tenant))?;
+                }
+            }
+
+            Ok(())
+        },
+    );
+
+    Ok(())
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1093,8 +1154,8 @@ pub(crate) async fn load_tenant(
         resources,
         AttachedTenantConf::try_from(location_conf)?,
         None,
-        &TENANTS,
         SpawnMode::Normal,
+        None,
         ctx,
     )
     .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -1172,15 +1233,17 @@ pub(crate) async fn attach_tenant(
     // TODO: tenant directory remains on disk if we bail out from here on.
     //       See https://github.com/neondatabase/neon/issues/4233
 
+    let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::<Arc<Tenant>>(128);
+
     let attached_tenant = tenant_spawn(
         conf,
         tenant_id,
         &tenant_dir,
-        resources,
+        resources.clone(),
         AttachedTenantConf::try_from(location_conf)?,
         None,
-        &TENANTS,
         SpawnMode::Normal,
+        Some(deletion_upcall_tx),
         ctx,
     )?;
     // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
@@ -1193,6 +1256,8 @@ pub(crate) async fn attach_tenant(
         )));
     }
 
+    spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx);
+
     tenant_guard.upsert(TenantSlot::Attached(attached_tenant))?;
     Ok(())
 }
@@ -1260,7 +1325,7 @@ pub enum TenantMapError {
 /// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
 /// for this tenant, which acts as a marker for any operations targeting
 /// this tenant to retry later, or wait for the InProgress state to end.
-pub struct SlotGuard {
+pub(crate) struct SlotGuard {
     tenant_id: TenantId,
     old_value: Option<TenantSlot>,
     upserted: bool,
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 6d30664515..f3201d4739 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -410,7 +410,7 @@ impl DeleteTimelineFlow {
     }
 
     /// Shortcut to create Timeline in stopping state and spawn deletion task.
-    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
+    /// See corresponding parts of [`crate::tenant::timeline::delete::DeleteTimelineFlow`]
     #[instrument(skip_all, fields(%timeline_id))]
     pub async fn resume_deletion(
         tenant: Arc<Tenant>,
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index ae77197088..2ed30dbf97 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -116,8 +116,6 @@ FAILPOINTS = [
     "tenant-delete-before-create-local-mark",
     "tenant-delete-before-background",
     "tenant-delete-before-polling-ongoing-deletions",
-    "tenant-delete-before-cleanup-remaining-fs-traces",
-    "tenant-delete-before-remove-timelines-dir",
     "tenant-delete-before-remove-deleted-mark",
     "tenant-delete-before-remove-tenant-dir",
     # Some failpoints from timeline deletion
@@ -129,7 +127,6 @@ FAILPOINTS = [
 
 FAILPOINTS_BEFORE_BACKGROUND = [
     "timeline-delete-before-schedule",
-    "tenant-delete-before-shutdown",
     "tenant-delete-before-create-remote-mark",
     "tenant-delete-before-create-local-mark",
     "tenant-delete-before-background",
@@ -243,10 +240,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
     if check is Check.RETRY_WITH_RESTART:
         env.pageserver.restart()
 
-        if failpoint in (
-            "tenant-delete-before-shutdown",
-            "tenant-delete-before-create-remote-mark",
-        ):
+        if failpoint in ("tenant-delete-before-create-remote-mark",):
             wait_until_tenant_active(
                 ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
             )