From 13d3f4c29f5627d0b8aa234544bb73847cae909c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 May 2023 17:13:53 +0200 Subject: [PATCH] set_stopping(): report in result if not transitioning to Stopping --- pageserver/src/tenant.rs | 44 +++++++++++++++++++++++++----------- pageserver/src/tenant/mgr.rs | 28 +++++++++++++++-------- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f03443b6a7..65ad8ae8e3 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -447,6 +447,11 @@ pub enum DeleteTimelineError { Other(#[from] anyhow::Error), } +pub enum SetStoppingError { + AlreadyStopping, + Broken, +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -1711,7 +1716,7 @@ impl Tenant { /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. /// /// This function is not cancel-safe! - pub async fn set_stopping(&self) { + pub async fn set_stopping(&self) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); // cannot stop before we're done activating, so wait out until we're done activating @@ -1729,8 +1734,8 @@ impl Tenant { .expect("cannot drop self.state while on a &self method"); // we now know we're done activating, let's see whether this task is the winner to transition into Stopping - let mut stopping = false; - self.state.send_modify(|current_state| match current_state { + let mut err = None; + let stopping = self.state.send_if_modified(|current_state| match current_state { TenantState::Activating | TenantState::Loading | TenantState::Attaching => { unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } @@ -1739,29 +1744,42 @@ impl Tenant { // are created after the transition to Stopping. That's harmless, as the Timelines // won't be accessible to anyone afterwards, because the Tenant is in Stopping state. *current_state = TenantState::Stopping; - stopping = true; // Continue stopping outside the closure. We need to grab timelines.lock() // and we plan to turn it into a tokio::sync::Mutex in a future patch. + true } TenantState::Broken { reason, .. } => { info!( "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}" ); + err = Some(SetStoppingError::Broken); + false } TenantState::Stopping => { info!("Tenant is already in Stopping state"); + err = Some(SetStoppingError::AlreadyStopping); + false } }); - - if stopping { - let timelines_accessor = self.timelines.lock().unwrap(); - let not_broken_timelines = timelines_accessor - .values() - .filter(|timeline| timeline.current_state() != TimelineState::Broken); - for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Stopping); - } + match (stopping, err) { + (true, None) => {} // continue + (false, Some(err)) => return Err(err), + (true, Some(_)) => unreachable!( + "send_if_modified closure must error out if not transitioning to Stopping" + ), + (false, None) => unreachable!( + "send_if_modified closure must return true if transitioning to Stopping" + ), } + + let timelines_accessor = self.timelines.lock().unwrap(); + let not_broken_timelines = timelines_accessor + .values() + .filter(|timeline| timeline.current_state() != TimelineState::Broken); + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Stopping); + } + Ok(()) } /// Method for tenant::mgr to transition us into Broken state in case of a late failure in diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index d40468a5b3..185b5eb3c8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -19,7 +19,9 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; +use crate::tenant::{ + create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState, +}; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; @@ -247,7 +249,7 @@ pub async fn shutdown_all_tenants() { let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); for (_, tenant) in tenants_to_shut_down { // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping().await; + let _ = tenant.set_stopping().await; // TODO handle error tenants_to_freeze_and_flush.push(tenant); } @@ -587,14 +589,20 @@ where { let tenants_accessor = TENANTS.write().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => match tenant.current_state() { - TenantState::Attaching - | TenantState::Loading - | TenantState::Activating - | TenantState::Broken { .. } - | TenantState::Active => tenant.set_stopping().await, - TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)), - }, + Some(tenant) => { + match tenant.set_stopping().await { + Ok(()) => { + // we won, continue stopping procedure + } + Err(SetStoppingError::Broken) => { + // continue the procedure, let's hope the closure can deal with broken tenants + } + Err(SetStoppingError::AlreadyStopping) => { + // the tenant is already stopping or broken, don't do anything + return Err(TenantStateError::IsStopping(tenant_id)); + } + } + } None => return Err(TenantStateError::NotFound(tenant_id)), } }