mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 14:02:55 +00:00
Rename Paused states to Stopping.
I'm not a fan of "Paused", for two reasons: - Paused implies that the tenant/timeline has no activity on it. That's not true; the tenant/timeline can still have active tasks working on it. - Paused implies that it can be resumed later. It cannot. A tenant or timeline in this state cannot be switched back to Active state anymore. A completely new Tenant or Timeline struct can be constructed for the same tenant or timeline later, e.g. if you detach and later re-attach the same tenant, but that's a different thing. Stopping describes the state better. I also considered "ShuttingDown", but Stopping is simpler as it's a single word.
This commit is contained in:
committed by
Heikki Linnakangas
parent
9a6c0be823
commit
33834c01ec
@@ -23,7 +23,7 @@ pub enum TenantState {
|
||||
Active,
|
||||
/// A tenant is recognized by pageserver, but it is being detached or the
|
||||
/// system is being shut down.
|
||||
Paused,
|
||||
Stopping,
|
||||
/// A tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations, because it failed to be activated.
|
||||
Broken,
|
||||
@@ -35,7 +35,7 @@ impl TenantState {
|
||||
Self::Loading => true,
|
||||
Self::Attaching => true,
|
||||
Self::Active => false,
|
||||
Self::Paused => false,
|
||||
Self::Stopping => false,
|
||||
Self::Broken => false,
|
||||
}
|
||||
}
|
||||
@@ -53,7 +53,7 @@ pub enum TimelineState {
|
||||
Suspended,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
|
||||
/// automatically become Active after certain events: only a management call can change this status.
|
||||
Paused,
|
||||
Stopping,
|
||||
/// A timeline is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations, because it failed to be activated.
|
||||
Broken,
|
||||
|
||||
@@ -1248,7 +1248,7 @@ impl Tenant {
|
||||
|
||||
/// Removes timeline-related in-memory data
|
||||
pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
// Transition the timeline into TimelineState::Paused.
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
@@ -1269,14 +1269,14 @@ impl Tenant {
|
||||
};
|
||||
|
||||
let timeline = Arc::clone(timeline_entry.get());
|
||||
timeline.set_state(TimelineState::Paused);
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
drop(timelines);
|
||||
timeline
|
||||
};
|
||||
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
// No timeout here, GC & Compaction should be responsive to the `TimelineState::Paused` change.
|
||||
// No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
@@ -1301,7 +1301,7 @@ impl Tenant {
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Paused`.
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
@@ -1355,10 +1355,10 @@ impl Tenant {
|
||||
"Could not activate tenant because it is in broken state"
|
||||
));
|
||||
}
|
||||
TenantState::Paused => {
|
||||
TenantState::Stopping => {
|
||||
// The tenant was detached, or system shutdown was requested, while we were
|
||||
// loading or attaching the tenant.
|
||||
info!("Tenant is already in Paused state, skipping activation");
|
||||
info!("Tenant is already in Stopping state, skipping activation");
|
||||
}
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Active;
|
||||
@@ -1384,16 +1384,16 @@ impl Tenant {
|
||||
result
|
||||
}
|
||||
|
||||
/// Change tenant status to paused, to mark that it is being shut down
|
||||
pub fn set_paused(&self) {
|
||||
/// Change tenant status to Stopping, to mark that it is being shut down
|
||||
pub fn set_stopping(&self) {
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Active | TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Paused;
|
||||
*current_state = TenantState::Stopping;
|
||||
|
||||
// FIXME: If the tenant is still Loading or Attaching, new timelines
|
||||
// might be created after this. That's harmless, as the Timelines
|
||||
// won't be accessible to anyone, when the Tenant is in Paused
|
||||
// won't be accessible to anyone, when the Tenant is in Stopping
|
||||
// state.
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
@@ -1404,12 +1404,12 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
TenantState::Broken => {
|
||||
info!("Cannot set tenant to Paused state, it is already in Broken state");
|
||||
info!("Cannot set tenant to Stopping state, it is already in Broken state");
|
||||
}
|
||||
TenantState::Paused => {
|
||||
TenantState::Stopping => {
|
||||
// The tenant was detached, or system shutdown was requested, while we were
|
||||
// loading or attaching the tenant.
|
||||
info!("Tenant is already in Paused state");
|
||||
info!("Tenant is already in Stopping state");
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -1430,10 +1430,10 @@ impl Tenant {
|
||||
// This shouldn't happen either
|
||||
warn!("Tenant is already broken");
|
||||
}
|
||||
TenantState::Paused => {
|
||||
TenantState::Stopping => {
|
||||
// This shouldn't happen either
|
||||
*current_state = TenantState::Broken;
|
||||
warn!("Marking Paused tenant as Broken");
|
||||
warn!("Marking Stopping tenant as Broken");
|
||||
}
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Broken;
|
||||
@@ -1458,7 +1458,7 @@ impl Tenant {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken | TenantState::Paused => {
|
||||
TenantState::Broken | TenantState::Stopping => {
|
||||
// There's no chance the tenant can transition back into ::Active
|
||||
anyhow::bail!(
|
||||
"Tenant {} will not become active. Current state: {:?}",
|
||||
|
||||
@@ -555,8 +555,8 @@ impl Timeline {
|
||||
let _layer_removal_cs = self.layer_removal_cs.lock().await;
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Paused {
|
||||
anyhow::bail!("timeline is paused: {:?}", state);
|
||||
if state == TimelineState::Stopping {
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
}
|
||||
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
@@ -668,8 +668,8 @@ impl Timeline {
|
||||
(TimelineState::Broken, _) => {
|
||||
error!("Ignoring state update {new_state:?} for broken tenant");
|
||||
}
|
||||
(TimelineState::Paused, TimelineState::Active) => {
|
||||
debug!("Not activating a paused timeline");
|
||||
(TimelineState::Stopping, TimelineState::Active) => {
|
||||
debug!("Not activating a Stopping timeline");
|
||||
}
|
||||
(_, new_state) => {
|
||||
self.state.send_replace(new_state);
|
||||
@@ -1251,7 +1251,7 @@ impl Timeline {
|
||||
match new_state {
|
||||
// we're running this job for active timelines only
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state),
|
||||
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state),
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return None,
|
||||
@@ -2393,8 +2393,8 @@ impl Timeline {
|
||||
let _layer_removal_cs = self.layer_removal_cs.lock().await;
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Paused {
|
||||
anyhow::bail!("timeline is paused: {:?}", state);
|
||||
if state == TimelineState::Stopping {
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
}
|
||||
|
||||
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
|
||||
|
||||
@@ -170,7 +170,7 @@ pub async fn shutdown_all_tenants() {
|
||||
for (_, tenant) in m.drain() {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_paused();
|
||||
tenant.set_stopping();
|
||||
tenants_to_shut_down.push(tenant)
|
||||
}
|
||||
}
|
||||
@@ -310,7 +310,7 @@ pub async fn detach_tenant(
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
};
|
||||
|
||||
tenant.set_paused();
|
||||
tenant.set_stopping();
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
|
||||
@@ -214,7 +214,7 @@ async fn connection_manager_loop_step(
|
||||
match new_state {
|
||||
// we're already active as walreceiver, no need to reactivate
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||
|
||||
@@ -1765,8 +1765,8 @@ class NeonPageserver(PgProtocol):
|
||||
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
|
||||
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
|
||||
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
|
||||
".*gc_loop.*Gc failed, retrying in.*timeline is paused: Paused", # When gc checks timeline state after acquiring layer_removal_cs
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is paused: Paused", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user