storcon: refine chaos selection logic (#10600)

## Problem

In https://github.com/neondatabase/neon/pull/10438 it was pointed out
that it would be good to avoid picking tenants in ID order, and also to
avoid situations where we might double-select the same tenant.

There was an initial swing at this in
https://github.com/neondatabase/neon/pull/10443, where Chi suggested a
simpler approach which is done in this PR

## Summary of changes

- Split total set of tenants into in and out of home AZ
- Consume out of home AZ first, and if necessary shuffle + consume from
in home AZ
This commit is contained in:
John Spray
2025-01-30 22:45:43 +00:00
committed by GitHub
parent e1273acdb1
commit 5e0c40709f

View File

@@ -96,29 +96,38 @@ impl ChaosInjector {
let batch_size = 128;
let mut inner = self.service.inner.write().unwrap();
let (nodes, tenants, scheduler) = inner.parts_mut();
let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
// Prefer to migrate tenants that are currently outside their home AZ. This avoids the chaos injector
// continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some
// random tenants to move, and then on next chaos iteration moving them back, then picking some new
// random tenants on the next iteration.
let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants
.values()
.map(|shard| {
(
shard.tenant_shard_id,
shard.is_attached_outside_preferred_az(nodes),
)
})
.partition(|(_id, is_outside)| *is_outside);
let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect();
let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect();
let mut victims = Vec::with_capacity(batch_size);
for shard in tenants.values() {
if shard.is_attached_outside_preferred_az(nodes) {
victims.push(shard.tenant_shard_id);
}
if out_of_home_az.len() >= batch_size {
tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len());
if victims.len() >= batch_size {
break;
}
out_of_home_az.shuffle(&mut thread_rng());
victims.extend(out_of_home_az.into_iter().take(batch_size));
} else {
tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()));
victims.extend(out_of_home_az);
in_home_az.shuffle(&mut thread_rng());
victims.extend(in_home_az.into_iter().take(batch_size - victims.len()));
}
let choose_random = batch_size.saturating_sub(victims.len());
tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());
let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
victims.extend(random_victims);
for victim in victims {
self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
}