fix: properly cancel if any reparenting failed

This commit is contained in:
Joonas Koivunen
2024-07-19 09:53:09 +00:00
parent c6c560e4c8
commit bb3d70e24d
2 changed files with 18 additions and 3 deletions

View File

@@ -2072,6 +2072,7 @@ impl TenantManager {
} else {
// at least the latest versions have now been downloaded and refreshed; be ready to
// retry another time.
tenant.ongoing_timeline_detach.cancel(attempt);
return Err(anyhow::anyhow!(
"failed to reparent all candidate timelines, please retry"
));

View File

@@ -281,18 +281,20 @@ impl SharedState {
g.validate(&attempt);
}
let attempt = scopeguard::guard(attempt, |attempt| {
let mut attempt = scopeguard::guard(attempt, |attempt| {
// our attempt will no longer be valid, so release it
self.inner.lock().unwrap().cancel(attempt);
});
// no failpoint needed here, because the next one is the first mutating
tenant
.wait_to_become_active(std::time::Duration::from_secs(9999))
.await
.map_err(Error::WaitToActivate)?;
// TODO: pause failpoint here to catch the situation where detached timeline is deleted...?
// we are not yet holding the gate so it could advance to the point of removing from
// timelines.
let Some(timeline) = tenant
.timelines
.lock()
@@ -300,9 +302,15 @@ impl SharedState {
.get(&attempt.timeline_id)
.cloned()
else {
// FIXME: this needs a test case ... basically deletion right after activation?
unreachable!("unsure if there is an ordering, but perhaps this is possible?");
};
// the gate being entered does not matter much, but let's be strict
assert!(attempt.gate_entered.is_none());
let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;
attempt.gate_entered = Some(entered);
// this should be a 503 at least...?
fail::fail_point!(
"timeline-detach-ancestor::complete_before_uploading",
@@ -324,6 +332,12 @@ impl SharedState {
Ok(())
}
/// Cancels the given detach-ancestor `attempt`.
///
/// Takes the `inner` lock, hands `attempt` over to the locked state's own
/// `cancel`, and then emits an info-level log line noting that gc blocking
/// is kept for a retried detach_ancestor.
///
/// # Panics
///
/// Panics if acquiring the `inner` lock returns an error (the `unwrap`),
/// which presumably means the mutex was poisoned -- confirm against the
/// type of `inner`.
pub(crate) fn cancel(&self, attempt: Attempt) {
    // The guard is deliberately bound to a local so that it lives until the
    // end of the function, i.e. the lock is still held while logging.
    let mut locked_state = self.inner.lock().unwrap();
    locked_state.cancel(attempt);
    tracing::info!("keeping the gc blocking for retried detach_ancestor");
}
}
#[derive(Default)]