diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 0bfca5385e..99079c57b0 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1272,7 +1272,9 @@ impl TenantShard { } /// Return true if the optimization was really applied: it will not be applied if the optimization's - /// sequence is behind this tenant shard's + /// sequence is behind this tenant shard's or if the intent state proposed by the optimization + /// is not compatible with the current intent state. The later may happen when the background + /// reconcile loops runs concurrently with HTTP driven optimisations. pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, @@ -1282,6 +1284,15 @@ impl TenantShard { return false; } + if !self.validate_optimization(&optimization) { + tracing::info!( + "Skipping optimization for {} because it does not match current intent: {:?}", + self.tenant_shard_id, + optimization, + ); + return false; + } + metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization @@ -1322,6 +1333,34 @@ impl TenantShard { true } + /// Check that the desired modifications to the intent state are compatible with + /// the current intent state + fn validate_optimization(&self, optimization: &ScheduleOptimization) -> bool { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id, + new_attached_node_id, + }) => { + self.intent.attached == Some(old_attached_node_id) + && self.intent.secondary.contains(&new_attached_node_id) + } + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: _, + new_node_id, + }) => { + // It's legal to remove a secondary that is not present in the intent state + !self.intent.secondary.contains(&new_node_id) + } + ScheduleOptimizationAction::CreateSecondary(new_node_id) => { + !self.intent.secondary.contains(&new_node_id) + } + ScheduleOptimizationAction::RemoveSecondary(_) => { + // It's legal to remove a secondary that is not present in the intent state + true + } + } + } + /// When a shard has several secondary locations, we need to pick one in situations where /// we promote one of them to an attached location: /// - When draining a node for restart diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 0bb210db23..1b77831b75 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ".*Local notification hook failed.*", ".*Marking shard.*for notification retry.*", ".*Failed to notify compute.*", + # As an optimization, the storage controller kicks the downloads on the secondary + # after the shard split. However, secondaries are created async, so it's possible + # that the intent state was modified, but the actual secondary hasn't been created, + # which results in an error. + ".*Error calling secondary download after shard split.*", ] )