Rename Paused states to Stopping.

I'm not a fan of "Paused", for two reasons:

- Paused implies that the tenant/timeline has no activity on it. That's
  not true; the tenant/timeline can still have active tasks working on it.

- Paused implies that it can be resumed later. It cannot. A tenant or
  timeline in this state cannot be switched back to Active state anymore.
  A completely new Tenant or Timeline struct can be constructed for the
  same tenant or timeline later, e.g. if you detach and later re-attach
  the same tenant, but that's a different thing.

Stopping describes the state better. I also considered "ShuttingDown",
but Stopping is simpler as it's a single word.
This commit is contained in:
Heikki Linnakangas
2022-11-29 22:12:36 +02:00
committed by Heikki Linnakangas
parent 9a6c0be823
commit 33834c01ec
6 changed files with 31 additions and 31 deletions

View File

@@ -23,7 +23,7 @@ pub enum TenantState {
Active,
/// A tenant is recognized by pageserver, but it is being detached or the
/// system is being shut down.
Paused,
Stopping,
/// A tenant is recognized by the pageserver, but can no longer be used for
/// any operations, because it failed to be activated.
Broken,
@@ -35,7 +35,7 @@ impl TenantState {
Self::Loading => true,
Self::Attaching => true,
Self::Active => false,
Self::Paused => false,
Self::Stopping => false,
Self::Broken => false,
}
}
@@ -53,7 +53,7 @@ pub enum TimelineState {
Suspended,
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
/// automatically become Active after certain events: only a management call can change this status.
Paused,
Stopping,
/// A timeline is recognized by the pageserver, but can no longer be used for
/// any operations, because it failed to be activated.
Broken,

View File

@@ -1248,7 +1248,7 @@ impl Tenant {
/// Removes timeline-related in-memory data
pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
// Transition the timeline into TimelineState::Paused.
// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
let timeline = {
let mut timelines = self.timelines.lock().unwrap();
@@ -1269,14 +1269,14 @@ impl Tenant {
};
let timeline = Arc::clone(timeline_entry.get());
timeline.set_state(TimelineState::Paused);
timeline.set_state(TimelineState::Stopping);
drop(timelines);
timeline
};
info!("waiting for layer_removal_cs.lock()");
// No timeout here, GC & Compaction should be responsive to the `TimelineState::Paused` change.
// No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
@@ -1301,7 +1301,7 @@ impl Tenant {
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Paused`.
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
if children_exist {
@@ -1355,10 +1355,10 @@ impl Tenant {
"Could not activate tenant because it is in broken state"
));
}
TenantState::Paused => {
TenantState::Stopping => {
// The tenant was detached, or system shutdown was requested, while we were
// loading or attaching the tenant.
info!("Tenant is already in Paused state, skipping activation");
info!("Tenant is already in Stopping state, skipping activation");
}
TenantState::Loading | TenantState::Attaching => {
*current_state = TenantState::Active;
@@ -1384,16 +1384,16 @@ impl Tenant {
result
}
/// Change tenant status to paused, to mark that it is being shut down
pub fn set_paused(&self) {
/// Change tenant status to Stopping, to mark that it is being shut down
pub fn set_stopping(&self) {
self.state.send_modify(|current_state| {
match *current_state {
TenantState::Active | TenantState::Loading | TenantState::Attaching => {
*current_state = TenantState::Paused;
*current_state = TenantState::Stopping;
// FIXME: If the tenant is still Loading or Attaching, new timelines
// might be created after this. That's harmless, as the Timelines
// won't be accessible to anyone, when the Tenant is in Paused
// won't be accessible to anyone, when the Tenant is in Stopping
// state.
let timelines_accessor = self.timelines.lock().unwrap();
let not_broken_timelines = timelines_accessor
@@ -1404,12 +1404,12 @@ impl Tenant {
}
}
TenantState::Broken => {
info!("Cannot set tenant to Paused state, it is already in Broken state");
info!("Cannot set tenant to Stopping state, it is already in Broken state");
}
TenantState::Paused => {
TenantState::Stopping => {
// The tenant was detached, or system shutdown was requested, while we were
// loading or attaching the tenant.
info!("Tenant is already in Paused state");
info!("Tenant is already in Stopping state");
}
}
});
@@ -1430,10 +1430,10 @@ impl Tenant {
// This shouldn't happen either
warn!("Tenant is already broken");
}
TenantState::Paused => {
TenantState::Stopping => {
// This shouldn't happen either
*current_state = TenantState::Broken;
warn!("Marking Paused tenant as Broken");
warn!("Marking Stopping tenant as Broken");
}
TenantState::Loading | TenantState::Attaching => {
*current_state = TenantState::Broken;
@@ -1458,7 +1458,7 @@ impl Tenant {
TenantState::Active { .. } => {
return Ok(());
}
TenantState::Broken | TenantState::Paused => {
TenantState::Broken | TenantState::Stopping => {
// There's no chance the tenant can transition back into ::Active
anyhow::bail!(
"Tenant {} will not become active. Current state: {:?}",

View File

@@ -555,8 +555,8 @@ impl Timeline {
let _layer_removal_cs = self.layer_removal_cs.lock().await;
// Is the timeline being deleted?
let state = *self.state.borrow();
if state == TimelineState::Paused {
anyhow::bail!("timeline is paused: {:?}", state);
if state == TimelineState::Stopping {
anyhow::bail!("timeline is Stopping");
}
let target_file_size = self.get_checkpoint_distance();
@@ -668,8 +668,8 @@ impl Timeline {
(TimelineState::Broken, _) => {
error!("Ignoring state update {new_state:?} for broken tenant");
}
(TimelineState::Paused, TimelineState::Active) => {
debug!("Not activating a paused timeline");
(TimelineState::Stopping, TimelineState::Active) => {
debug!("Not activating a Stopping timeline");
}
(_, new_state) => {
self.state.send_replace(new_state);
@@ -1251,7 +1251,7 @@ impl Timeline {
match new_state {
// we're running this job for active timelines only
TimelineState::Active => continue,
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state),
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state),
}
}
Err(_sender_dropped_error) => return None,
@@ -2393,8 +2393,8 @@ impl Timeline {
let _layer_removal_cs = self.layer_removal_cs.lock().await;
// Is the timeline being deleted?
let state = *self.state.borrow();
if state == TimelineState::Paused {
anyhow::bail!("timeline is paused: {:?}", state);
if state == TimelineState::Stopping {
anyhow::bail!("timeline is Stopping");
}
let (horizon_cutoff, pitr_cutoff, retain_lsns) = {

View File

@@ -170,7 +170,7 @@ pub async fn shutdown_all_tenants() {
for (_, tenant) in m.drain() {
if tenant.is_active() {
// updates tenant state, forbidding new GC and compaction iterations from starting
tenant.set_paused();
tenant.set_stopping();
tenants_to_shut_down.push(tenant)
}
}
@@ -310,7 +310,7 @@ pub async fn detach_tenant(
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
};
tenant.set_paused();
tenant.set_stopping();
// shutdown all tenant and timeline tasks (gc, compaction, page service)
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;

View File

@@ -214,7 +214,7 @@ async fn connection_manager_loop_step(
match new_state {
// we're already active as walreceiver, no need to reactivate
TimelineState::Active => continue,
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
}
}
Err(_sender_dropped_error) => return ControlFlow::Break(()),

View File

@@ -1765,8 +1765,8 @@ class NeonPageserver(PgProtocol):
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
".*gc_loop.*Gc failed, retrying in.*timeline is paused: Paused", # When gc checks timeline state after acquiring layer_removal_cs
".*compaction_loop.*Compaction failed, retrying in.*timeline is paused: Paused", # When compaction checks timeline state after acquiring layer_removal_cs
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
]