remodel the return type

This commit is contained in:
Joonas Koivunen
2024-07-19 14:43:49 +00:00
parent dfdf40916f
commit deb86c1ea1
2 changed files with 96 additions and 50 deletions

View File

@@ -2024,61 +2024,66 @@ impl TenantManager {
let timeline = tenant.get_timeline(timeline_id, true)?;
let (resp, reparented_all) = timeline
let resp = timeline
.detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
.await?;
let mut slot_guard = slot_guard.into_inner();
attempt.before_shutdown();
let tenant = if resp.reset_tenant_required() {
attempt.before_shutdown();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
SpawnMode::Eager,
Some(&attempt),
ctx,
);
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
SpawnMode::Eager,
Some(&attempt),
ctx,
);
slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
tenant
} else {
tracing::info!("skipping tenant_reset as no changes made required it");
tenant
};
if reparented_all {
if let Some(reparented) = resp.completed() {
// finally ask the restarted tenant to complete the detach
tenant
.ongoing_timeline_detach
.complete(attempt, &tenant)
.await?;
Ok(reparented)
} else {
// at least the latest versions have now been downloaded and refreshed; be ready to
// retry another time.
tenant.ongoing_timeline_detach.cancel(attempt);
return Err(anyhow::anyhow!(
Err(anyhow::anyhow!(
"failed to reparent all candidate timelines, please retry"
));
))
}
Ok(resp)
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must

View File

@@ -308,9 +308,12 @@ impl SharedState {
};
// the gate being antered does not matter much, but lets be strict
assert!(attempt.gate_entered.is_none());
let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;
attempt.gate_entered = Some(entered);
if attempt.gate_entered.is_none() {
let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;
attempt.gate_entered = Some(entered);
} else {
// Some(gate_entered) means the tenant was not restarted, as is not required
}
// this should be an 503 at least...?
fail::fail_point!(
@@ -970,13 +973,48 @@ async fn remote_copy(
.map_err(CopyFailed)
}
pub(crate) enum DetachingAndReparenting {
/// All of the following timeline ids were reparented and the timeline ancestor detach must be
/// marked as completed.
Reparented(Vec<TimelineId>),
/// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
/// completed.
///
/// Nested `must_restart` is set to true when any restart requiring changes were made.
SomeReparentingFailed { must_restart: bool },
/// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
/// must be marked as completed.
AlreadyDone(Vec<TimelineId>),
}
impl DetachingAndReparenting {
pub(crate) fn reset_tenant_required(&self) -> bool {
use DetachingAndReparenting::*;
match self {
Reparented(_) => true,
SomeReparentingFailed { must_restart } => *must_restart,
AlreadyDone(_) => false,
}
}
pub(crate) fn completed(self) -> Option<Vec<TimelineId>> {
use DetachingAndReparenting::*;
match self {
Reparented(x) | AlreadyDone(x) => Some(x),
SomeReparentingFailed { .. } => None,
}
}
}
/// See [`Timeline::detach_from_ancestor_and_reparent`].
pub(super) async fn detach_and_reparent(
detached: &Arc<Timeline>,
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<(Vec<TimelineId>, bool), anyhow::Error> {
) -> Result<DetachingAndReparenting, anyhow::Error> {
let PreparedTimelineDetach { layers } = prepared;
#[derive(Debug)]
@@ -1016,21 +1054,24 @@ pub(super) async fn detach_and_reparent(
if let Some(ancestor) = existing {
Ancestor::Detached(ancestor, ancestor_lsn)
} else {
return Ok((reparented_direct_children(detached, tenant)?, true));
let direct_children = reparented_direct_children(detached, tenant)?;
return Ok(DetachingAndReparenting::AlreadyDone(direct_children));
}
} else {
// TODO: make sure there are no `?` before tenant_reset from after a questionmark from
// here.
panic!("bug: complete called on a timeline which has not been detached or which has no live ancestor");
};
// publish the prepared layers before we reparent any of the timelines, so that on restart
// reparented timelines find layers. also do the actual detaching.
//
// if we crash after this operation, we will at least come up having detached a timeline, but
// we cannot go back and reparent the timelines which would had been reparented in normal
// execution.
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
// if we crash after this operation, a retry will allow reparenting the remaining timelines as
// gc is blocked.
assert!(
detach_is_ongoing,
"to detach and reparent, gc must still be blocked"
);
let (ancestor, ancestor_lsn) = match ancestor {
Ancestor::NotDetached(ancestor, ancestor_lsn) => {
@@ -1050,7 +1091,6 @@ pub(super) async fn detach_and_reparent(
Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn),
};
assert!(detach_is_ongoing, "to reparent, gc must still be blocked");
let mut tasks = tokio::task::JoinSet::new();
// Returns a single permit semaphore which will be used to make one reparenting succeed,
@@ -1124,8 +1164,8 @@ pub(super) async fn detach_and_reparent(
Some(timeline)
}
Err(e) => {
// with the use of tenant slot, timeline deletion is the most likely
// reason.
// with the use of tenant slot, raced timeline deletion is the most
// likely reason.
tracing::warn!("reparenting failed: {e:#}");
None
}
@@ -1176,9 +1216,10 @@ pub(super) async fn detach_and_reparent(
.map(|(_, timeline_id)| timeline_id)
.collect();
Ok((reparented, true))
Ok(DetachingAndReparenting::Reparented(reparented))
} else {
// TODO: two-state Ok(return_value)?
Ok((Vec::new(), false))
Ok(DetachingAndReparenting::SomeReparentingFailed {
must_restart: reparented.is_empty(),
})
}
}