diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 03387d00fe..af31fda06b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1274,8 +1274,29 @@ impl Tenant { timeline }; - info!("waiting for layer_removal_cs.lock()"); + // Now that the Timeline is in Stopping state, request all the related tasks to + // shut down. + // + // NB: If you call delete_timeline multiple times concurrently, they will + // all go through the motions here. Make sure the code here is idempotent, + // and don't error out if some of the shutdown tasks have already been + // completed! + + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(timeline_id), + ) + .await; + debug!("wal receiver shutdown confirmed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; + // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change. + info!("waiting for layer_removal_cs.lock()"); let layer_removal_guard = timeline.layer_removal_cs.lock().await; info!("got layer_removal_cs.lock(), deleting layer files"); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 85be420cb8..e4e9d0c6e8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -262,27 +262,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { - // Start with the shutdown of timeline tasks (this shuts down the walreceiver) - // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join - // we cannot create new timelines and tenants, and that can take quite some time, - // it can even become stuck due to a bug making whole pageserver unavailable for some operations - // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation - // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests - // will synchronize and either fail with the not found error or succeed - - debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(tenant_id), - Some(timeline_id), - ) - .await; - debug!("wal receiver shutdown confirmed"); - - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; - info!("timeline task shutdown completed"); match get_tenant(tenant_id, true).await { Ok(tenant) => { tenant.delete_timeline(timeline_id).await?;