mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 13:32:57 +00:00
storcon: update compute hook state on detach (#9045)
## Problem Previously, the storage controller may send compute notifications containing stale pageservers (i.e. pageserver serving the shard was detached). This happened because detaches did not update the compute hook state. ## Summary of Changes Update compute hook state on shard detach. Fixes #8928
This commit is contained in:
@@ -71,6 +71,37 @@ impl ComputeHookTenant {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sharded(&self) -> bool {
|
||||
matches!(self, ComputeHookTenant::Sharded(_))
|
||||
}
|
||||
|
||||
/// Clear compute hook state for the specified shard.
|
||||
/// Only valid for [`ComputeHookTenant::Sharded`] instances.
|
||||
fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) {
|
||||
match self {
|
||||
ComputeHookTenant::Sharded(sharded) => {
|
||||
if sharded.stripe_size != stripe_size
|
||||
|| sharded.shard_count != tenant_shard_id.shard_count
|
||||
{
|
||||
tracing::warn!("Shard split detected while handling detach")
|
||||
}
|
||||
|
||||
let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| {
|
||||
*shard_number == tenant_shard_id.shard_number
|
||||
});
|
||||
|
||||
if let Some(shard_idx) = shard_idx {
|
||||
sharded.shards.remove(shard_idx);
|
||||
} else {
|
||||
tracing::warn!("Shard not found while handling detach")
|
||||
}
|
||||
}
|
||||
ComputeHookTenant::Unsharded(_) => {
|
||||
unreachable!("Detach of unsharded tenants is handled externally");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
|
||||
/// and drops existing content.
|
||||
fn update(
|
||||
@@ -614,6 +645,36 @@ impl ComputeHook {
|
||||
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Reflect a detach for a particular shard in the compute hook state.
|
||||
///
|
||||
/// The goal is to avoid sending compute notifications with stale information (i.e.
|
||||
/// including detach pageservers).
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(super) fn handle_detach(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
stripe_size: ShardStripeSize,
|
||||
) {
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
let mut state_locked = self.state.lock().unwrap();
|
||||
match state_locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(_) => {
|
||||
tracing::warn!("Compute hook tenant not found for detach");
|
||||
}
|
||||
Entry::Occupied(mut e) => {
|
||||
let sharded = e.get().is_sharded();
|
||||
if !sharded {
|
||||
e.remove();
|
||||
} else {
|
||||
e.get_mut().remove_shard(tenant_shard_id, stripe_size);
|
||||
}
|
||||
|
||||
tracing::debug!("Compute hook handled shard detach");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -820,6 +820,16 @@ impl Reconciler {
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
}
|
||||
|
||||
// The condition below identifies a detach. We must have no attached intent and
|
||||
// must have been attached to something previously. Pass this information to
|
||||
// the [`ComputeHook`] such that it can update its tenant-wide state.
|
||||
if self.intent.attached.is_none() && !self.detach.is_empty() {
|
||||
// TODO: Consider notifying control plane about detaches. This would avoid situations
|
||||
// where the compute tries to start-up with a stale set of pageservers.
|
||||
self.compute_hook
|
||||
.handle_detach(self.tenant_shard_id, self.shard.stripe_size);
|
||||
}
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
|
||||
|
||||
Ok(())
|
||||
|
||||
Reference in New Issue
Block a user