mirror of https://github.com/neondatabase/neon.git
remodel the return type
@@ -2024,61 +2024,66 @@ impl TenantManager {
         let timeline = tenant.get_timeline(timeline_id, true)?;
 
-        let (resp, reparented_all) = timeline
+        let resp = timeline
             .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
             .await?;
 
         let mut slot_guard = slot_guard.into_inner();
 
-        attempt.before_shutdown();
+        let tenant = if resp.reset_tenant_required() {
+            attempt.before_shutdown();
 
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                // this really should not happen, at all, unless shutdown was already going?
-                anyhow::bail!("Cannot restart Tenant, already shutting down");
-            }
-        }
+            let (_guard, progress) = utils::completion::channel();
+            match tenant.shutdown(progress, ShutdownMode::Hard).await {
+                Ok(()) => {
+                    slot_guard.drop_old_value()?;
+                }
+                Err(_barrier) => {
+                    slot_guard.revert();
+                    // this really should not happen, at all, unless shutdown was already going?
+                    anyhow::bail!("Cannot restart Tenant, already shutting down");
+                }
+            }
 
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            SpawnMode::Eager,
-            Some(&attempt),
-            ctx,
-        );
+            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+            let shard_identity = config.shard;
+            let tenant = tenant_spawn(
+                self.conf,
+                tenant_shard_id,
+                &tenant_path,
+                self.resources.clone(),
+                AttachedTenantConf::try_from(config)?,
+                shard_identity,
+                None,
+                SpawnMode::Eager,
+                Some(&attempt),
+                ctx,
+            );
 
-        slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
+            slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?;
+            tenant
+        } else {
+            tracing::info!("skipping tenant_reset as no changes made required it");
+            tenant
+        };
 
-        if reparented_all {
+        if let Some(reparented) = resp.completed() {
             // finally ask the restarted tenant to complete the detach
             tenant
                 .ongoing_timeline_detach
                 .complete(attempt, &tenant)
                 .await?;
+            Ok(reparented)
         } else {
             // at least the latest versions have now been downloaded and refreshed; be ready to
             // retry another time.
             tenant.ongoing_timeline_detach.cancel(attempt);
-            return Err(anyhow::anyhow!(
+            Err(anyhow::anyhow!(
                 "failed to reparent all candidate timelines, please retry"
-            ));
+            ))
         }
-
-        Ok(resp)
     }
 
     /// A page service client sends a TenantId, and to look up the correct Tenant we must
@@ -308,9 +308,12 @@ impl SharedState {
         };
 
-        // the gate being antered does not matter much, but lets be strict
-        assert!(attempt.gate_entered.is_none());
-        let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;
-        attempt.gate_entered = Some(entered);
+        if attempt.gate_entered.is_none() {
+            let entered = timeline.gate.enter().map_err(|_| Error::ShuttingDown)?;
+            attempt.gate_entered = Some(entered);
+        } else {
+            // Some(gate_entered) means the tenant was not restarted, as is not required
+        }
 
         // this should be an 503 at least...?
         fail::fail_point!(
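The hunk above makes gate entry conditional: after this change an attempt may already hold a gate guard because the tenant was not restarted, so entering again has to be skipped. A minimal sketch of the same idempotent pattern, written against hypothetical simplified Gate/Attempt types rather than the pageserver's own:

// Hypothetical, simplified stand-ins; illustration only, not the pageserver's types.
struct GateGuard;

#[derive(Debug)]
struct GateClosed;

struct Gate {
    closed: bool,
}

impl Gate {
    fn enter(&self) -> Result<GateGuard, GateClosed> {
        if self.closed {
            Err(GateClosed)
        } else {
            Ok(GateGuard)
        }
    }
}

struct Attempt {
    gate_entered: Option<GateGuard>,
}

impl Attempt {
    // Enter the gate only when no guard is held yet; Some(guard) means the tenant
    // was not restarted and the guard obtained earlier is still valid.
    fn ensure_gate_entered(&mut self, gate: &Gate) -> Result<(), GateClosed> {
        if self.gate_entered.is_none() {
            self.gate_entered = Some(gate.enter()?);
        }
        Ok(())
    }
}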
@@ -970,13 +973,48 @@ async fn remote_copy(
         .map_err(CopyFailed)
 }
 
+pub(crate) enum DetachingAndReparenting {
+    /// All of the following timeline ids were reparented and the timeline ancestor detach must be
+    /// marked as completed.
+    Reparented(Vec<TimelineId>),
+
+    /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
+    /// completed.
+    ///
+    /// Nested `must_restart` is set to true when any restart requiring changes were made.
+    SomeReparentingFailed { must_restart: bool },
+
+    /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
+    /// must be marked as completed.
+    AlreadyDone(Vec<TimelineId>),
+}
+
+impl DetachingAndReparenting {
+    pub(crate) fn reset_tenant_required(&self) -> bool {
+        use DetachingAndReparenting::*;
+        match self {
+            Reparented(_) => true,
+            SomeReparentingFailed { must_restart } => *must_restart,
+            AlreadyDone(_) => false,
+        }
+    }
+
+    pub(crate) fn completed(self) -> Option<Vec<TimelineId>> {
+        use DetachingAndReparenting::*;
+        match self {
+            Reparented(x) | AlreadyDone(x) => Some(x),
+            SomeReparentingFailed { .. } => None,
+        }
+    }
+}
+
 /// See [`Timeline::detach_from_ancestor_and_reparent`].
 pub(super) async fn detach_and_reparent(
     detached: &Arc<Timeline>,
     tenant: &Tenant,
     prepared: PreparedTimelineDetach,
     _ctx: &RequestContext,
-) -> Result<(Vec<TimelineId>, bool), anyhow::Error> {
+) -> Result<DetachingAndReparenting, anyhow::Error> {
     let PreparedTimelineDetach { layers } = prepared;
 
     #[derive(Debug)]
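The enum and helpers added above replace the old (Vec<TimelineId>, bool) return value with named cases. A rough, self-contained sketch of the dispatch a caller performs on it, using a simplified TimelineId and caller (mirroring the TenantManager hunk earlier in this diff, not the actual pageserver code):

// Simplified stand-in types, for illustration only.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct TimelineId(u128);

#[derive(Debug)]
enum DetachingAndReparenting {
    Reparented(Vec<TimelineId>),
    SomeReparentingFailed { must_restart: bool },
    AlreadyDone(Vec<TimelineId>),
}

impl DetachingAndReparenting {
    fn reset_tenant_required(&self) -> bool {
        use DetachingAndReparenting::*;
        match self {
            Reparented(_) => true,
            SomeReparentingFailed { must_restart } => *must_restart,
            AlreadyDone(_) => false,
        }
    }

    fn completed(self) -> Option<Vec<TimelineId>> {
        use DetachingAndReparenting::*;
        match self {
            Reparented(x) | AlreadyDone(x) => Some(x),
            SomeReparentingFailed { .. } => None,
        }
    }
}

// The caller first decides whether the tenant must be reset, then whether the
// detach can be marked completed.
fn handle(resp: DetachingAndReparenting) -> Result<Vec<TimelineId>, String> {
    if resp.reset_tenant_required() {
        // hypothetical placeholder: shut the tenant down and respawn it here
    }
    match resp.completed() {
        Some(reparented) => Ok(reparented),
        None => Err("failed to reparent all candidate timelines, please retry".to_string()),
    }
}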
@@ -1016,21 +1054,24 @@ pub(super) async fn detach_and_reparent(
         if let Some(ancestor) = existing {
             Ancestor::Detached(ancestor, ancestor_lsn)
         } else {
-            return Ok((reparented_direct_children(detached, tenant)?, true));
+            let direct_children = reparented_direct_children(detached, tenant)?;
+            return Ok(DetachingAndReparenting::AlreadyDone(direct_children));
         }
     } else {
         // TODO: make sure there are no `?` before tenant_reset from after a questionmark from
         // here.
         panic!("bug: complete called on a timeline which has not been detached or which has no live ancestor");
     };
 
     // publish the prepared layers before we reparent any of the timelines, so that on restart
     // reparented timelines find layers. also do the actual detaching.
     //
-    // if we crash after this operation, we will at least come up having detached a timeline, but
-    // we cannot go back and reparent the timelines which would had been reparented in normal
-    // execution.
-    //
-    // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
-    // which could give us a completely wrong layer combination.
+    // if we crash after this operation, a retry will allow reparenting the remaining timelines as
+    // gc is blocked.
+    assert!(
+        detach_is_ongoing,
+        "to detach and reparent, gc must still be blocked"
+    );
 
     let (ancestor, ancestor_lsn) = match ancestor {
         Ancestor::NotDetached(ancestor, ancestor_lsn) => {
@@ -1050,7 +1091,6 @@ pub(super) async fn detach_and_reparent(
         Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn),
     };
 
-    assert!(detach_is_ongoing, "to reparent, gc must still be blocked");
     let mut tasks = tokio::task::JoinSet::new();
 
     // Returns a single permit semaphore which will be used to make one reparenting succeed,
@@ -1124,8 +1164,8 @@ pub(super) async fn detach_and_reparent(
                 Some(timeline)
             }
             Err(e) => {
-                // with the use of tenant slot, timeline deletion is the most likely
-                // reason.
+                // with the use of tenant slot, raced timeline deletion is the most
+                // likely reason.
                 tracing::warn!("reparenting failed: {e:#}");
                 None
             }
@@ -1176,9 +1216,10 @@ pub(super) async fn detach_and_reparent(
             .map(|(_, timeline_id)| timeline_id)
             .collect();
 
-        Ok((reparented, true))
+        Ok(DetachingAndReparenting::Reparented(reparented))
     } else {
-        // TODO: two-state Ok(return_value)?
-        Ok((Vec::new(), false))
+        Ok(DetachingAndReparenting::SomeReparentingFailed {
+            must_restart: reparented.is_empty(),
+        })
    }
 }
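Taken together, the commit remodels the old (reparented, reparented_all) tuple into the three-variant enum. A speculative sketch of how the old values line up with the new variants, illustrative only and not code from the repository:

// Illustrative mapping from the old return value to the new enum; not repository code.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct TimelineId(u128);

#[derive(Debug)]
enum DetachingAndReparenting {
    Reparented(Vec<TimelineId>),
    SomeReparentingFailed { must_restart: bool },
    AlreadyDone(Vec<TimelineId>),
}

fn from_old_return_value(
    reparented: Vec<TimelineId>,
    reparented_all: bool,
) -> DetachingAndReparenting {
    if reparented_all {
        // the old code also reported a previously completed detach as (children, true),
        // which the new AlreadyDone variant now distinguishes explicitly.
        DetachingAndReparenting::Reparented(reparented)
    } else {
        // the old code returned (Vec::new(), false) here; whether a restart is still
        // required is exactly the information the tuple could not carry.
        DetachingAndReparenting::SomeReparentingFailed {
            must_restart: reparented.is_empty(),
        }
    }
}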