storcon: validate intent state before applying optimization (#12593)

## Problem

In the gap between picking an optimization and applying it, something
might insert a change to the intent state that makes it incompatible.
If the change is done via the `schedule()` method, we are covered by the
increased sequence number, but otherwise we can panic if we violate the
intent state invariants.

## Summary of Changes

Validate the optimization right before applying it. Since we hold the
service lock at that point, nothing else can sneak in.

Closes LKB-65
This commit is contained in:
Vlad Lazar
2025-07-16 15:37:40 +01:00
committed by GitHub
parent c71aea0223
commit 3e4cbaed67
2 changed files with 45 additions and 1 deletions

View File

@@ -1272,7 +1272,9 @@ impl TenantShard {
}
/// Return true if the optimization was really applied: it will not be applied if the optimization's
/// sequence is behind this tenant shard's
/// sequence is behind this tenant shard's or if the intent state proposed by the optimization
/// is not compatible with the current intent state. The later may happen when the background
/// reconcile loops runs concurrently with HTTP driven optimisations.
pub(crate) fn apply_optimization(
&mut self,
scheduler: &mut Scheduler,
@@ -1282,6 +1284,15 @@ impl TenantShard {
return false;
}
if !self.validate_optimization(&optimization) {
tracing::info!(
"Skipping optimization for {} because it does not match current intent: {:?}",
self.tenant_shard_id,
optimization,
);
return false;
}
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_schedule_optimization
@@ -1322,6 +1333,34 @@ impl TenantShard {
true
}
/// Check that the desired modifications to the intent state are compatible with
/// the current intent state
fn validate_optimization(&self, optimization: &ScheduleOptimization) -> bool {
match optimization.action {
ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
old_attached_node_id,
new_attached_node_id,
}) => {
self.intent.attached == Some(old_attached_node_id)
&& self.intent.secondary.contains(&new_attached_node_id)
}
ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
old_node_id: _,
new_node_id,
}) => {
// It's legal to remove a secondary that is not present in the intent state
!self.intent.secondary.contains(&new_node_id)
}
ScheduleOptimizationAction::CreateSecondary(new_node_id) => {
!self.intent.secondary.contains(&new_node_id)
}
ScheduleOptimizationAction::RemoveSecondary(_) => {
// It's legal to remove a secondary that is not present in the intent state
true
}
}
}
/// When a shard has several secondary locations, we need to pick one in situations where
/// we promote one of them to an attached location:
/// - When draining a node for restart