Files
neon/storage_controller/src/service/context_iterator.rs
John Spray ea3798e3b3 storage controller: use proper ScheduleContext when evacuating a node (#9908)
## Problem

When picking locations for a shard, we should use a ScheduleContext that
includes all the other shards in the tenant, so that we apply proper
anti-affinity between shards. If we don't do this, then it can lead to
unstable scheduling, where we place a shard somewhere that the optimizer
will then immediately move it away from.

We didn't always do this, because it was a bit awkward to accumulate the
context for a tenant rather than just walking tenants.

This was a TODO in `handle_node_availability_transition`:
```
                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
                        // for tenants without secondary locations: if they have a secondary location, then this
                        // schedule() call is just promoting an existing secondary)
```

This is a precursor to https://github.com/neondatabase/neon/issues/8264,
where the current imperfect scheduling during node evacuation hampers
testing.

## Summary of changes

- Add an iterator type that yields each shard along with a
schedulecontext that includes all the other shards from the same tenant
- Use the iterator to replace hand-crafted logic in optimize_all_plan
(functionally identical)
- Use the iterator in `handle_node_availability_transition` to apply
proper anti-affinity during node evacuation.
2024-11-29 13:27:49 +00:00

140 lines
5.1 KiB
Rust

use std::collections::BTreeMap;
use utils::id::TenantId;
use utils::shard::TenantShardId;
use crate::scheduler::{ScheduleContext, ScheduleMode};
use crate::tenant_shard::TenantShard;
/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole
/// tenant while considering the individual shards within it. This iterator is a helper
/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext
/// for the tenant.
pub(super) struct TenantShardContextIterator<'a> {
schedule_mode: ScheduleMode,
inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
}
impl<'a> TenantShardContextIterator<'a> {
pub(super) fn new(
tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
schedule_mode: ScheduleMode,
) -> Self {
Self {
schedule_mode,
inner: tenants.iter_mut(),
}
}
}
impl<'a> Iterator for TenantShardContextIterator<'a> {
type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);
fn next(&mut self) -> Option<Self::Item> {
let mut tenant_shards = Vec::new();
let mut schedule_context = ScheduleContext::new(self.schedule_mode.clone());
loop {
let (tenant_shard_id, shard) = self.inner.next()?;
if tenant_shard_id.is_shard_zero() {
// Cleared on last shard of previous tenant
assert!(tenant_shards.is_empty());
}
// Accumulate the schedule context for all the shards in a tenant
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
tenant_shards.push(shard);
if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
return Some((tenant_shard_id.tenant_id, schedule_context, tenant_shards));
}
}
}
}
#[cfg(test)]
mod tests {
use std::{collections::BTreeMap, str::FromStr};
use pageserver_api::controller_api::PlacementPolicy;
use utils::shard::{ShardCount, ShardNumber};
use crate::{
scheduler::test_utils::make_test_nodes, service::Scheduler,
tenant_shard::tests::make_test_tenant_with_id,
};
use super::*;
#[test]
fn test_context_iterator() {
// Hand-crafted tenant IDs to ensure they appear in the expected order when put into
// a btreemap & iterated
let mut t_1_shards = make_test_tenant_with_id(
TenantId::from_str("af0480929707ee75372337efaa5ecf96").unwrap(),
PlacementPolicy::Attached(1),
ShardCount(1),
None,
);
let t_2_shards = make_test_tenant_with_id(
TenantId::from_str("bf0480929707ee75372337efaa5ecf96").unwrap(),
PlacementPolicy::Attached(1),
ShardCount(4),
None,
);
let mut t_3_shards = make_test_tenant_with_id(
TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(),
PlacementPolicy::Attached(1),
ShardCount(1),
None,
);
let t1_id = t_1_shards[0].tenant_shard_id.tenant_id;
let t2_id = t_2_shards[0].tenant_shard_id.tenant_id;
let t3_id = t_3_shards[0].tenant_shard_id.tenant_id;
let mut tenants = BTreeMap::new();
tenants.insert(t_1_shards[0].tenant_shard_id, t_1_shards.pop().unwrap());
for shard in t_2_shards {
tenants.insert(shard.tenant_shard_id, shard);
}
tenants.insert(t_3_shards[0].tenant_shard_id, t_3_shards.pop().unwrap());
let nodes = make_test_nodes(3, &[]);
let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
for shard in tenants.values_mut() {
shard.schedule(&mut scheduler, &mut context).unwrap();
}
let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
let (tenant_id, context, shards) = iter.next().unwrap();
assert_eq!(tenant_id, t1_id);
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
assert_eq!(shards.len(), 1);
assert_eq!(context.attach_count(), 1);
let (tenant_id, context, shards) = iter.next().unwrap();
assert_eq!(tenant_id, t2_id);
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
assert_eq!(shards[1].tenant_shard_id.shard_number, ShardNumber(1));
assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
assert_eq!(shards.len(), 4);
assert_eq!(context.attach_count(), 4);
let (tenant_id, context, shards) = iter.next().unwrap();
assert_eq!(tenant_id, t3_id);
assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
assert_eq!(shards.len(), 1);
assert_eq!(context.attach_count(), 1);
for shard in tenants.values_mut() {
shard.intent.clear(&mut scheduler);
}
}
}