From 516ac0591e762142ca0ce85f212192c5af59a097 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 2 Sep 2024 12:36:57 +0100
Subject: [PATCH] storage controller: eliminate ensure_attached (#8875)

## Problem

This is a followup to #8783.

- The old blocking ensure_attached function had been retained to handle the
  case where a shard had a None generation_pageserver, but this wasn't really
  necessary.
- There was a subtle `.1` in the code where a struct would have been clearer.

Closes #8819

## Summary of changes

- Add ShardGenerationState to represent the results of peek_generations.
- Instead of calling ensure_attached when a tenant has a non-attached shard,
  check the shard's policy: return 409 if it isn't Attached, else return 503
  if the shard's policy is Attached but it hasn't been reconciled yet
  (i.e. it has a None generation_pageserver).
---
 storage_controller/src/persistence.rs |  22 +++--
 storage_controller/src/service.rs     | 124 +++++++++-----------------
 2 files changed, 58 insertions(+), 88 deletions(-)
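The sketch below restates the 409-vs-503 decision from the summary of changes as a
self-contained example. The types are simplified stand-ins, not the controller's real
`PlacementPolicy`, `ShardGenerationState`, or `ApiError`, so it illustrates the intended
mapping rather than the exact code in the diff below.

```rust
// Simplified sketch of the error mapping: if any shard lacks an attached
// pageserver, return 409 when the placement policy says it should not be
// attached, and 503 when it should be attached but is not yet reconciled.

#[derive(Debug)]
enum PlacementPolicy {
    Attached(usize),
    Secondary,
    Detached,
}

struct ShardGenerationState {
    generation: Option<u32>,
    generation_pageserver: Option<u64>,
}

/// Returns Ok(()) if every shard is attached somewhere, otherwise an
/// (HTTP status, message) pair standing in for the controller's ApiError.
fn check_attached(
    policies: &[PlacementPolicy],
    shards: &[ShardGenerationState],
) -> Result<(), (u16, String)> {
    let any_unattached = shards
        .iter()
        .any(|s| s.generation.is_none() || s.generation_pageserver.is_none());
    if !any_unattached {
        return Ok(());
    }
    for policy in policies {
        match policy {
            PlacementPolicy::Attached(_) => {
                // Meant to be attached, just not reconciled yet: fall through to 503.
            }
            PlacementPolicy::Secondary | PlacementPolicy::Detached => {
                // Not meant to be attached: retrying cannot help, so 409.
                return Err((409, format!("tenant shard has policy {policy:?}")));
            }
        }
    }
    Err((503, "one or more shards in tenant is not yet attached".into()))
}

fn main() {
    let shards = [ShardGenerationState {
        generation: None,
        generation_pageserver: None,
    }];
    // Attached policy but no generation_pageserver yet: caller should retry (503).
    assert_eq!(
        check_attached(&[PlacementPolicy::Attached(1)], &shards).unwrap_err().0,
        503
    );
    // Detached policy: caller should give up (409).
    assert_eq!(
        check_attached(&[PlacementPolicy::Detached], &shards).unwrap_err().0,
        409
    );
}
```

The distinction matters to callers such as timeline creation: a 503 is worth retrying
because the reconciler is expected to attach the shard shortly, while a 409 means
retrying cannot help until the tenant's placement policy changes.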
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index a842079ce7..6e1c2016ff 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -122,6 +122,13 @@ pub(crate) enum TenantFilter {
     Shard(TenantShardId),
 }
 
+/// Represents the results of looking up generation+pageserver for the shards of a tenant
+pub(crate) struct ShardGenerationState {
+    pub(crate) tenant_shard_id: TenantShardId,
+    pub(crate) generation: Option<Generation>,
+    pub(crate) generation_pageserver: Option<NodeId>,
+}
+
 impl Persistence {
     // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
     // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
@@ -540,7 +547,7 @@ impl Persistence {
     pub(crate) async fn peek_generations(
         &self,
         filter_tenant_id: TenantId,
-    ) -> Result<Vec<(TenantShardId, Option<Generation>, Option<NodeId>)>, DatabaseError> {
+    ) -> Result<Vec<ShardGenerationState>, DatabaseError> {
         use crate::schema::tenant_shards::dsl::*;
         let rows = self
             .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
@@ -555,13 +562,12 @@ impl Persistence {
 
         Ok(rows
             .into_iter()
-            .map(|p| {
-                (
-                    p.get_tenant_shard_id()
-                        .expect("Corrupt tenant shard id in database"),
-                    p.generation.map(|g| Generation::new(g as u32)),
-                    p.generation_pageserver.map(|n| NodeId(n as u64)),
-                )
+            .map(|p| ShardGenerationState {
+                tenant_shard_id: p
+                    .get_tenant_shard_id()
+                    .expect("Corrupt tenant shard id in database"),
+                generation: p.generation.map(|g| Generation::new(g as u32)),
+                generation_pageserver: p.generation_pageserver.map(|n| NodeId(n as u64)),
             })
             .collect())
     }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 1f221a9b45..78627953d0 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -22,7 +22,7 @@ use crate::{
     peer_client::GlobalObservedState,
     persistence::{
         AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
-        TenantFilter,
+        ShardGenerationState, TenantFilter,
     },
     reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
     scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -3106,20 +3106,44 @@ impl Service {
         // will still be the latest when we're done: we will check generations again at the end of
         // this function to handle that.
         let generations = self.persistence.peek_generations(tenant_id).await?;
-        let generations = if generations.iter().any(|i| i.1.is_none()) {
-            // One or more shards is not attached to anything: maybe this is a new tenant? Wait for
-            // it to reconcile.
-            self.ensure_attached_wait(tenant_id).await?;
-            self.persistence.peek_generations(tenant_id).await?
-        } else {
-            generations
-        };
+
+        if generations
+            .iter()
+            .any(|i| i.generation.is_none() || i.generation_pageserver.is_none())
+        {
+            // One or more shards has not been attached to a pageserver. Check if this is because it's configured
+            // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry)
+            let locked = self.inner.read().unwrap();
+            for (shard_id, shard) in
+                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
+            {
+                match shard.policy {
+                    PlacementPolicy::Attached(_) => {
+                        // This shard is meant to be attached: the caller is not wrong to try and
+                        // use this function, but we can't service the request right now.
+                    }
+                    PlacementPolicy::Secondary | PlacementPolicy::Detached => {
+                        return Err(ApiError::Conflict(format!(
+                            "Shard {shard_id} tenant has policy {:?}",
+                            shard.policy
+                        )));
+                    }
+                }
+            }
+
+            return Err(ApiError::ResourceUnavailable(
+                "One or more shards in tenant is not yet attached".into(),
+            ));
+        }
 
         let locked = self.inner.read().unwrap();
-        for (tenant_shard_id, generation, generation_pageserver) in generations {
-            let node_id = generation_pageserver.ok_or(ApiError::Conflict(
-                "Tenant not currently attached".to_string(),
-            ))?;
+        for ShardGenerationState {
+            tenant_shard_id,
+            generation,
+            generation_pageserver,
+        } in generations
+        {
+            let node_id = generation_pageserver.expect("We checked for None above");
             let node = locked
                 .nodes
                 .get(&node_id)
@@ -3141,7 +3165,13 @@ impl Service {
         let latest_generations = self.persistence.peek_generations(tenant_id).await?;
         if latest_generations
             .into_iter()
-            .map(|g| (g.0, g.1))
+            .map(
+                |ShardGenerationState {
+                     tenant_shard_id,
+                     generation,
+                     generation_pageserver: _,
+                 }| (tenant_shard_id, generation),
+            )
             .collect::<Vec<_>>()
             != target_gens
                 .into_iter()
@@ -5280,72 +5310,6 @@ impl Service {
         ))
     }
 
-    /// Helper for methods that will try and call pageserver APIs for
-    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
-    /// is attached somewhere.
-    fn ensure_attached_schedule(
-        &self,
-        mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
-        tenant_id: TenantId,
-    ) -> Result<Vec<ReconcilerWaiter>, anyhow::Error> {
-        let mut waiters = Vec::new();
-        let (nodes, tenants, scheduler) = locked.parts_mut();
-
-        let mut schedule_context = ScheduleContext::default();
-        for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
-            shard.schedule(scheduler, &mut schedule_context)?;
-
-            // The shard's policies may not result in an attached location being scheduled: this
-            // is an error because our caller needs it attached somewhere.
-            if shard.intent.get_attached().is_none() {
-                return Err(anyhow::anyhow!(
-                    "Tenant {tenant_id} not scheduled to be attached"
-                ));
-            };
-
-            if shard.stably_attached().is_some() {
-                // We do not require the shard to be totally up to date on reconciliation: we just require
-                // that it has been attached on the intended node. Other dirty state such as unattached secondary
-                // locations, or compute hook notifications can be ignored.
-                continue;
-            }
-
-            if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
-                tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
-                waiters.push(waiter);
-            }
-        }
-        Ok(waiters)
-    }
-
-    async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
-        let ensure_waiters = {
-            let locked = self.inner.write().unwrap();
-
-            // Check if the tenant is splitting: in this case, even if it is attached,
-            // we must act as if it is not: this blocks e.g. timeline creation/deletion
-            // operations during the split.
-            for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
-                if !matches!(shard.splitting, SplitState::Idle) {
-                    return Err(ApiError::ResourceUnavailable(
-                        "Tenant shards are currently splitting".into(),
-                    ));
-                }
-            }
-
-            self.ensure_attached_schedule(locked, tenant_id)
-                .map_err(ApiError::InternalServerError)?
-        };
-
-        let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
-        for waiter in ensure_waiters {
-            let timeout = deadline.duration_since(Instant::now());
-            waiter.wait_timeout(timeout).await?;
-        }
-
-        Ok(())
-    }
-
     /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler
     /// configuration
     fn maybe_reconcile_shard(