Move code from tenant_mgr::delete_timeline to Tenant::delete_timeline.

It's better to request the tasks to shut down only after setting the
timeline state to Stopping. Otherwise, a new task could spawn after we
have waited for the existing tasks to shut down but before we have
changed the state, and we would fail to wait for it.

Feels nicer from a readability point of view too.
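As a rough illustration of the ordering argument above, here is a minimal sketch with hypothetical names (`Timeline`, `stopping`, and `shutdown_tasks` are stand-ins, not the pageserver's `TimelineState::Stopping` or `task_mgr` API). Because the flag is set before we wait, any task spawned afterwards observes it and bails out, so it cannot slip past the wait.

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

struct Timeline {
    // Stand-in for TimelineState::Stopping.
    stopping: AtomicBool,
}

// Stand-in for task_mgr::shutdown_tasks(...): waits for already-running tasks.
async fn shutdown_tasks(_timeline: &Timeline) {}

async fn delete_timeline(timeline: Arc<Timeline>) {
    // 1. Mark the timeline as stopping *first*.
    timeline.stopping.store(true, Ordering::SeqCst);
    // 2. Only then wait for existing tasks. A task spawned after this point
    //    sees `stopping == true` and exits immediately, so the old race
    //    (spawn after the wait, before the state change) is closed.
    shutdown_tasks(&timeline).await;
}

async fn timeline_task(timeline: Arc<Timeline>) {
    // Every task checks the state before doing real work.
    if timeline.stopping.load(Ordering::SeqCst) {
        return;
    }
    // ... real work ...
}
```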
Author:    Heikki Linnakangas
Committer: Heikki Linnakangas
Date:      2022-12-20 01:42:54 +02:00
Parent:    0c71dc627b
Commit:    9a049aa846

2 changed files with 22 additions and 22 deletions


@@ -1274,8 +1274,29 @@ impl Tenant {
             timeline
         };
 
-        info!("waiting for layer_removal_cs.lock()");
+        // Now that the Timeline is in Stopping state, request all the related tasks to
+        // shut down.
+        //
+        // NB: If you call delete_timeline multiple times concurrently, they will
+        // all go through the motions here. Make sure the code here is idempotent,
+        // and don't error out if some of the shutdown tasks have already been
+        // completed!
+
+        // Stop the walreceiver first.
+        debug!("waiting for wal receiver to shutdown");
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.tenant_id),
+            Some(timeline_id),
+        )
+        .await;
+        debug!("wal receiver shutdown confirmed");
+
+        info!("waiting for timeline tasks to shutdown");
+        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
+        // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
+        info!("waiting for layer_removal_cs.lock()");
         let layer_removal_guard = timeline.layer_removal_cs.lock().await;
         info!("got layer_removal_cs.lock(), deleting layer files");


@@ -262,27 +262,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul
 }
 
 pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
-    // Start with the shutdown of timeline tasks (this shuts down the walreceiver)
-    // It is important that we do not take locks here, and do not check whether the timeline exists
-    // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
-    // we cannot create new timelines and tenants, and that can take quite some time,
-    // it can even become stuck due to a bug making whole pageserver unavailable for some operations
-    // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
-    // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
-    // will synchronize and either fail with the not found error or succeed
-    debug!("waiting for wal receiver to shutdown");
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::WalReceiverManager),
-        Some(tenant_id),
-        Some(timeline_id),
-    )
-    .await;
-    debug!("wal receiver shutdown confirmed");
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
-    info!("timeline task shutdown completed");
-
     match get_tenant(tenant_id, true).await {
         Ok(tenant) => {
             tenant.delete_timeline(timeline_id).await?;
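The comment removed above explains why tenant_mgr::delete_timeline deliberately did not hold tenants_state::write_tenants() while awaiting task shutdown. A hypothetical sketch of that hazard, with illustrative types rather than the pageserver's own: holding the global registry lock across a long await stalls every caller that needs the registry (for example, creating new tenants or timelines), whereas awaiting first and then taking the lock briefly does not.

```rust
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

type TimelineId = u64;
struct Timeline;
type Timelines = Arc<RwLock<HashMap<TimelineId, Timeline>>>;

// Stand-in for task_mgr::shutdown_tasks(...); may take a long time.
async fn wait_for_task_shutdown(_id: TimelineId) {}

async fn delete_timeline_holding_lock(timelines: Timelines, id: TimelineId) {
    // Hazard: the write lock is held across a potentially long await, so no
    // other request can touch the registry (e.g. create a timeline) until
    // the shutdown completes.
    let mut guard = timelines.write().await;
    wait_for_task_shutdown(id).await;
    guard.remove(&id);
}

async fn delete_timeline_lock_free_wait(timelines: Timelines, id: TimelineId) {
    // What the removed comment describes: await the shutdown without any lock,
    // then take the lock only briefly to drop the in-memory entry. Concurrent
    // delete requests synchronize here and either succeed or see "not found".
    wait_for_task_shutdown(id).await;
    if timelines.write().await.remove(&id).is_none() {
        // Another delete request got here first; report "not found" upstream.
    }
}
```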