From 6b6570b580d3807846ddb85635ac2354a1e462d5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 18 Jan 2023 17:25:51 +0100 Subject: [PATCH] remove TimelineState::Suspended, introduce TimelineState::Loading TimelineState::Suspended was dubious to begin with. I suppose that the intention was that timelines could transition back and forth between Active and Suspended states. But practically, the code before this patch never did that. The transitions were: () ==Timeline::new==> Suspended ==*==> {Active,Broken,Stopping} One exception: Tenant::set_stopping() could transition timelines like so: !Broken ==Tenant::set_stopping()==> Suspended But Tenant itself cannot transition from the Stopping state to any other state. Thus, this patch removes TimelineState::Suspended and introduces a new state Loading. The aforementioned transitions change as follows: - () ==Timeline::new==> Suspended ==*==> {Active,Broken,Stopping} + () ==Timeline::new==> Loading ==*==> {Active,Broken,Stopping} - !Broken ==Tenant::set_stopping()==> Suspended + !Broken ==Tenant::set_stopping()==> Stopping Walreceiver's connection manager loop watches TimelineState to decide whether it should retry connecting, or exit. This patch changes the loop to exit when it observes the transition into Stopping state. Walreceiver isn't supposed to be started until the timeline transitions into Active state. So, this patch also adds some warn!() messages in case this happens anyway. 
--- libs/pageserver_api/src/models.rs | 19 +++++++++---------- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/timeline.rs | 4 ++-- .../src/walreceiver/connection_manager.rs | 14 ++++++++++++-- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b5027cb331..d81e05aaa2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -44,18 +44,17 @@ impl TenantState { /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { - /// Timeline is fully operational. If the containing Tenant is Active, the timeline's - /// background jobs are running otherwise they will be launched when the tenant is activated. + /// The timeline is recognized by the pageserver but is not yet operational. + /// In particular, the walreceiver connection loop is not running for this timeline. + /// It will eventually transition to state Active or Broken. + Loading, + /// The timeline is fully operational. + /// It can be queried, and the walreceiver connection loop is running. Active, - /// A timeline is recognized by pageserver, but not yet ready to operate. - /// The status indicates, that the timeline could eventually go back to Active automatically: - /// for example, if the owning tenant goes back to Active again. - Suspended, - /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to - /// automatically become Active after certain events: only a management call can change this status. + /// The timeline was previously Loading or Active but is shutting down. + /// It cannot transition back into any other state. Stopping, - /// A timeline is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. 
+ /// The timeline is broken and not operational (previous states: Loading or Active). Broken, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c18c645e5b..6c49393969 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1487,7 +1487,7 @@ impl Tenant { .values() .filter(|timeline| timeline.current_state() != TimelineState::Broken); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Suspended); + timeline.set_state(TimelineState::Stopping); } } TenantState::Broken => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5b84df74d4..860ee45397 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -812,7 +812,7 @@ impl Timeline { pg_version: u32, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Suspended); + let (state, _) = watch::channel(TimelineState::Loading); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -1400,7 +1400,7 @@ impl Timeline { TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping - | TimelineState::Suspended => { + | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") } } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 8b60e59305..39668ef910 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -183,13 +183,23 @@ async fn connection_manager_loop_step( new_event = async { loop { + if walreceiver_state.timeline.current_state() == TimelineState::Loading { + warn!("wal connection manager should only be launched after timeline has become active"); + } match timeline_state_updates.changed().await { Ok(()) => { let new_state = 
walreceiver_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state), + TimelineState::Broken | TimelineState::Stopping => { + info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + TimelineState::Loading => { + warn!("timeline transitioned back to Loading state, that should not happen"); + return ControlFlow::Continue(new_state); + } } } Err(_sender_dropped_error) => return ControlFlow::Break(()), @@ -197,7 +207,7 @@ async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(new_state) => { - info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + info!("observed timeline state change, new state is {new_state:?}"); return ControlFlow::Continue(()); } ControlFlow::Break(()) => {