From 5432155b0d161a332d6d8ec2933a875d9959e558 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 23 Sep 2024 10:05:02 +0100 Subject: [PATCH] storcon: update compute hook state on detach (#9045) ## Problem Previously, the storage controller may send compute notifications containing stale pageservers (i.e. pageserver serving the shard was detached). This happened because detaches did not update the compute hook state. ## Summary of Changes Update compute hook state on shard detach. Fixes #8928 --- storage_controller/src/compute_hook.rs | 61 ++++++++++++++++++++++++++ storage_controller/src/reconciler.rs | 10 +++++ 2 files changed, 71 insertions(+) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index c46539485c..bafae1f551 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -71,6 +71,37 @@ impl ComputeHookTenant { } } + fn is_sharded(&self) -> bool { + matches!(self, ComputeHookTenant::Sharded(_)) + } + + /// Clear compute hook state for the specified shard. + /// Only valid for [`ComputeHookTenant::Sharded`] instances. + fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) { + match self { + ComputeHookTenant::Sharded(sharded) => { + if sharded.stripe_size != stripe_size + || sharded.shard_count != tenant_shard_id.shard_count + { + tracing::warn!("Shard split detected while handling detach") + } + + let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| { + *shard_number == tenant_shard_id.shard_number + }); + + if let Some(shard_idx) = shard_idx { + sharded.shards.remove(shard_idx); + } else { + tracing::warn!("Shard not found while handling detach") + } + } + ComputeHookTenant::Unsharded(_) => { + unreachable!("Detach of unsharded tenants is handled externally"); + } + } + } + /// Set one shard's location. If stripe size or shard count have changed, Self is reset /// and drops existing content. fn update( @@ -614,6 +645,36 @@ impl ComputeHook { self.notify_execute(maybe_send_result, tenant_shard_id, cancel) .await } + + /// Reflect a detach for a particular shard in the compute hook state. + /// + /// The goal is to avoid sending compute notifications with stale information (i.e. + /// including detach pageservers). + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] + pub(super) fn handle_detach( + &self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + ) { + use std::collections::hash_map::Entry; + + let mut state_locked = self.state.lock().unwrap(); + match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(_) => { + tracing::warn!("Compute hook tenant not found for detach"); + } + Entry::Occupied(mut e) => { + let sharded = e.get().is_sharded(); + if !sharded { + e.remove(); + } else { + e.get_mut().remove_shard(tenant_shard_id, stripe_size); + } + + tracing::debug!("Compute hook handled shard detach"); + } + } + } } #[cfg(test)] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 83b7b2b4f2..750bcd7c01 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -820,6 +820,16 @@ impl Reconciler { self.location_config(&node, conf, None, false).await?; } + // The condition below identifies a detach. We must have no attached intent and + // must have been attached to something previously. Pass this information to + // the [`ComputeHook`] such that it can update its tenant-wide state. + if self.intent.attached.is_none() && !self.detach.is_empty() { + // TODO: Consider notifying control plane about detaches. This would avoid situations + // where the compute tries to start-up with a stale set of pageservers. + self.compute_hook + .handle_detach(self.tenant_shard_id, self.shard.stripe_size); + } + failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue"); Ok(())