From 9609f7547ea6aba294e3614aa047cbe5f209f15f Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 7 Feb 2025 15:29:34 +0000 Subject: [PATCH] tests: address warnings in timeline shutdown (#10702) ## Problem There are a couple of log warnings tripping up `test_timeline_archival_chaos` - `[stopping left-over name="timeline_delete" tenant_shard_id=2d526292b67dac0e6425266d7079c253 timeline_id=Some(44ba36bfdee5023672c93778985facd9) kind=TimelineDeletionWorker\n')](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10672/13161357302/index.html#/testresult/716b997bb1d8a021)` - `ignoring attempt to restart exited flush_loop 503d8f401d8887cfaae873040a6cc193/d5eed0673ba37d8992f7ec411363a7e3\n')` Related: https://github.com/neondatabase/neon/issues/10389 ## Summary of changes - Downgrade the 'ignoring attempt to restart' to info -- there's nothing in the design that forbids this happening, i.e. someone calling maybe_spawn_flush_loop concurrently with shutdown() - Prevent timeline deletion tasks outliving tenants by carrying a gateguard. This logically makes sense because the deletion process does call into Tenant to update manifests. --- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/delete.rs | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 770ea418d1..2be6fc1e59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2633,7 +2633,7 @@ impl Timeline { return; } FlushLoopState::Exited => { - warn!( + info!( "ignoring attempt to restart exited flush_loop {}/{}", self.tenant_shard_id, self.timeline_id ); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 3c828c8a9e..5eb2d3aa24 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -341,6 +341,13 @@ impl DeleteTimelineFlow { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); + // Take a tenant gate guard, because timeline deletion needs access to the tenant to update its manifest. + let Ok(tenant_guard) = tenant.gate.enter() else { + // It is safe to simply skip here, because we only schedule background work once the timeline is durably marked for deletion. + info!("Tenant is shutting down, timeline deletion will be resumed when it next starts"); + return; + }; + task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, @@ -348,6 +355,8 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { + let _guard = tenant_guard; + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { // Only log as an error if it's not a cancellation. if matches!(err, DeleteTimelineError::Cancelled) {