From bb3d70e24d761f423df2aeb1d56d27d2475ac1cd Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 19 Jul 2024 09:53:09 +0000 Subject: [PATCH] fix: properly cancel if any reparenting failed --- pageserver/src/tenant/mgr.rs | 1 + .../src/tenant/timeline/detach_ancestor.rs | 20 ++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 31ca6f8fff..8da2ce84e6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2072,6 +2072,7 @@ impl TenantManager { } else { // at least the latest versions have now been downloaded and refreshed; be ready to // retry another time. + tenant.ongoing_timeline_detach.cancel(attempt); return Err(anyhow::anyhow!( "failed to reparent all candidate timelines, please retry" )); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index aad19f89b4..e1a0d3d0ca 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -281,18 +281,20 @@ impl SharedState { g.validate(&attempt); } - let attempt = scopeguard::guard(attempt, |attempt| { + let mut attempt = scopeguard::guard(attempt, |attempt| { // our attempt will no longer be valid, so release it self.inner.lock().unwrap().cancel(attempt); }); - // no failpoint needed here, because the next one is the first mutating - tenant .wait_to_become_active(std::time::Duration::from_secs(9999)) .await .map_err(Error::WaitToActivate)?; + // TODO: pause failpoint here to catch the situation where detached timeline is deleted...? + // we are not yet holding the gate so it could advance to the point of removing from + // timelines. + let Some(timeline) = tenant .timelines .lock() @@ -300,9 +302,15 @@ impl SharedState { .get(&attempt.timeline_id) .cloned() else { + // FIXME: this needs a test case ... basically deletion right after activation? unreachable!("unsure if there is an ordering, but perhaps this is possible?"); }; + // the gate being antered does not matter much, but lets be strict + assert!(attempt.gate_entered.is_none()); + let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + // this should be an 503 at least...? fail::fail_point!( "timeline-detach-ancestor::complete_before_uploading", @@ -324,6 +332,12 @@ impl SharedState { Ok(()) } + + pub(crate) fn cancel(&self, attempt: Attempt) { + let mut g = self.inner.lock().unwrap(); + g.cancel(attempt); + tracing::info!("keeping the gc blocking for retried detach_ancestor"); + } } #[derive(Default)]