From 94b41b531b31dec2d065d1c892748f4ea93fc384 Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Thu, 24 Jul 2025 17:14:47 +0100 Subject: [PATCH] storecon: Fix panic due to race with chaos migration on staging (#12727) ## Problem * Fixes LKB-743 We get regular assertion failures on staging caused by a race with chaos injector. If chaos injector decides to migrate a tenant shard between the background optimisation planning and applying optimisations then we attempt to migrate and already migrated shard and hit an assertion failure. ## Summary of changes @VladLazar fixed a variant of this issue by adding`validate_optimization` recently, however it didn't validate the specific property this other assertion requires. Fix is just to update it to cover all the expected properties. --- storage_controller/src/tenant_shard.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index f60378470e..3eb54d714d 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -249,6 +249,10 @@ impl IntentState { } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { + // Every assertion here should probably have a corresponding check in + // `validate_optimization` unless it is an invariant that should never be violated. Note + // that the lock is not held between planning optimizations and applying them so you have to + // assume any valid state transition of the intent state may have occurred assert!(!self.secondary.contains(&new_secondary)); assert!(self.attached != Some(new_secondary)); scheduler.update_node_ref_counts( @@ -1335,8 +1339,9 @@ impl TenantShard { true } - /// Check that the desired modifications to the intent state are compatible with - /// the current intent state + /// Check that the desired modifications to the intent state are compatible with the current + /// intent state. Note that the lock is not held between planning optimizations and applying + /// them so any valid state transition of the intent state may have occurred. fn validate_optimization(&self, optimization: &ScheduleOptimization) -> bool { match optimization.action { ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { @@ -1352,6 +1357,9 @@ impl TenantShard { }) => { // It's legal to remove a secondary that is not present in the intent state !self.intent.secondary.contains(&new_node_id) + // Ensure the secondary hasn't already been promoted to attached by a concurrent + // optimization/migration. + && self.intent.attached != Some(new_node_id) } ScheduleOptimizationAction::CreateSecondary(new_node_id) => { !self.intent.secondary.contains(&new_node_id)