From 9a049aa846aeef3e5d1e3be70b670c6697ba8e35 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 20 Dec 2022 01:42:54 +0200 Subject: [PATCH] Move code from tenant_mgr::delete_timeline to Tenant::delete_timeline. It's better to request the tasks to shut down only after setting the timeline state to Stopping. Otherwise, it's possible that a new task spawns after we have waited for the existing tasks to shut down, but before we have changed the state. We would fail to wait for them. Feels nicer from a readability point of view too. --- pageserver/src/tenant.rs | 23 ++++++++++++++++++++++- pageserver/src/tenant_mgr.rs | 21 --------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 03387d00fe..af31fda06b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1274,8 +1274,29 @@ impl Tenant { timeline }; - info!("waiting for layer_removal_cs.lock()"); + // Now that the Timeline is in Stopping state, request all the related tasks to + // shut down. + // + // NB: If you call delete_timeline multiple times concurrently, they will + // all go through the motions here. Make sure the code here is idempotent, + // and don't error out if some of the shutdown tasks have already been + // completed! + + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(timeline_id), + ) + .await; + debug!("wal receiver shutdown confirmed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; + // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change. + info!("waiting for layer_removal_cs.lock()"); let layer_removal_guard = timeline.layer_removal_cs.lock().await; info!("got layer_removal_cs.lock(), deleting layer files"); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 85be420cb8..e4e9d0c6e8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -262,27 +262,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { - // Start with the shutdown of timeline tasks (this shuts down the walreceiver) - // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join - // we cannot create new timelines and tenants, and that can take quite some time, - // it can even become stuck due to a bug making whole pageserver unavailable for some operations - // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation - // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests - // will synchronize and either fail with the not found error or succeed - - debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(tenant_id), - Some(timeline_id), - ) - .await; - debug!("wal receiver shutdown confirmed"); - - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; - info!("timeline task shutdown completed"); match get_tenant(tenant_id, true).await { Ok(tenant) => { tenant.delete_timeline(timeline_id).await?;