From deb86c1ea17daeef839f3adb43976670ee329848 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 19 Jul 2024 14:43:49 +0000 Subject: [PATCH] remodel the return type --- pageserver/src/tenant/mgr.rs | 71 ++++++++++-------- .../src/tenant/timeline/detach_ancestor.rs | 75 ++++++++++++++----- 2 files changed, 96 insertions(+), 50 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index e3a827cbe8..a1990234d4 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2024,61 +2024,66 @@ impl TenantManager { let timeline = tenant.get_timeline(timeline_id, true)?; - let (resp, reparented_all) = timeline + let resp = timeline .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; let mut slot_guard = slot_guard.into_inner(); - attempt.before_shutdown(); + let tenant = if resp.reset_tenant_required() { + attempt.before_shutdown(); - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); - } - } - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - SpawnMode::Eager, - Some(&attempt), - ctx, - ); + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + SpawnMode::Eager, + Some(&attempt), + ctx, + ); - slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; - if reparented_all { + if let Some(reparented) = resp.completed() { // finally ask the restarted tenant to complete the detach tenant .ongoing_timeline_detach .complete(attempt, &tenant) .await?; + Ok(reparented) } else { // at least the latest versions have now been downloaded and refreshed; be ready to // retry another time. tenant.ongoing_timeline_detach.cancel(attempt); - return Err(anyhow::anyhow!( + Err(anyhow::anyhow!( "failed to reparent all candidate timelines, please retry" - )); + )) } - - Ok(resp) } /// A page service client sends a TenantId, and to look up the correct Tenant we must diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 5b9201dfa4..6a56129577 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -308,9 +308,12 @@ impl SharedState { }; // the gate being antered does not matter much, but lets be strict - assert!(attempt.gate_entered.is_none()); - let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?; - attempt.gate_entered = Some(entered); + if attempt.gate_entered.is_none() { + let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required + } // this should be an 503 at least...? fail::fail_point!( @@ -970,13 +973,48 @@ async fn remote_copy( .map_err(CopyFailed) } +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(Vec), + + /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_restart` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_restart: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(Vec), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_restart } => *must_restart, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + /// See [`Timeline::detach_from_ancestor_and_reparent`]. pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result<(Vec, bool), anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; #[derive(Debug)] @@ -1016,21 +1054,24 @@ pub(super) async fn detach_and_reparent( if let Some(ancestor) = existing { Ancestor::Detached(ancestor, ancestor_lsn) } else { - return Ok((reparented_direct_children(detached, tenant)?, true)); + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); } } else { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. panic!("bug: complete called on a timeline which has not been detached or which has no live ancestor"); }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + assert!( + detach_is_ongoing, + "to detach and reparent, gc must still be blocked" + ); let (ancestor, ancestor_lsn) = match ancestor { Ancestor::NotDetached(ancestor, ancestor_lsn) => { @@ -1050,7 +1091,6 @@ pub(super) async fn detach_and_reparent( Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn), }; - assert!(detach_is_ongoing, "to reparent, gc must still be blocked"); let mut tasks = tokio::task::JoinSet::new(); // Returns a single permit semaphore which will be used to make one reparenting succeed, @@ -1124,8 +1164,8 @@ pub(super) async fn detach_and_reparent( Some(timeline) } Err(e) => { - // with the use of tenant slot, timeline deletion is the most likely - // reason. + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. tracing::warn!("reparenting failed: {e:#}"); None } @@ -1176,9 +1216,10 @@ pub(super) async fn detach_and_reparent( .map(|(_, timeline_id)| timeline_id) .collect(); - Ok((reparented, true)) + Ok(DetachingAndReparenting::Reparented(reparented)) } else { - // TODO: two-state Ok(return_value)? - Ok((Vec::new(), false)) + Ok(DetachingAndReparenting::SomeReparentingFailed { + must_restart: reparented.is_empty(), + }) } }