Files
neon/storage_controller/src/service/context_iterator.rs
John Spray fd1368d31e storcon: rework scheduler optimisation, prioritize AZ (#9916)
## Problem

We want to do a more robust job of scheduling tenants into their home
AZ: https://github.com/neondatabase/neon/issues/8264.

Closes:  https://github.com/neondatabase/neon/issues/8969

## Summary of changes

### Scope

This PR combines prioritizing AZ with a larger rework of how we do
optimisation. The rationale is that just bumping AZ in the order of
Score attributes is a very tiny change: the interesting part is lining
up all the optimisation logic to respect this properly, which means
rewriting it to use the same scores as the scheduler, rather than the
fragile hand-crafted logic that we had before. Separating these changes
out is possible, but would involve doing two rounds of test updates
instead of one.

### Scheduling optimisation

`TenantShard`'s `optimize_attachment` and `optimize_secondary` methods
now both use the scheduler to pick a new "favourite" location. Then
there is some refined logic for whether and how to migrate to it:
- To decide if a new location is sufficiently "better", we generate
scores using some projected ScheduleContexts that exclude the shard
under consideration, so that we avoid migrating from a node with
AffinityScore(2) to a node with AffinityScore(1), only to migrate back
later.
- Score types get a `for_optimization` method so that when we compare
scores, we will only do an optimisation if the scores differ by their
highest-ranking attributes, not just because one pageserver is lower in
utilization. Eventually we _will_ want a mode that does this, but doing
it here would make scheduling logic unstable and harder to test, and to
do this correctly one needs to know the size of the tenant that one is
migrating.
- When we find a new attached location that we would like to move to, we
will create a new secondary location there, even if we already had one
on some other node. This handles the case where we have a home AZ A, and
want to migrate the attachment between pageservers in that AZ while
retaining a secondary location in some other AZ as well.
- A unit test is added for
https://github.com/neondatabase/neon/issues/8969, which is implicitly
fixed by reworking optimisation to use the same scores as the
scheduler.
2025-01-13 19:33:00 +00:00

137 lines
5.0 KiB
Rust

use std::collections::BTreeMap;
use utils::id::TenantId;
use utils::shard::TenantShardId;
use crate::scheduler::{ScheduleContext, ScheduleMode};
use crate::tenant_shard::TenantShard;
/// Helper iterator for scheduling decisions that need a per-tenant view.
///
/// Scheduling often wants the [`ScheduleContext`] of a whole tenant while looking at the
/// individual shards inside it. This type walks a map of shards, collects all the shards
/// belonging to one tenant, and yields them together with a `ScheduleContext` built up
/// from those shards.
pub(super) struct TenantShardContextIterator<'a> {
    /// Mutable iterator over the shard map whose entries are being grouped by tenant.
    inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
    /// Mode passed to `ScheduleContext::new` for every tenant that is yielded.
    schedule_mode: ScheduleMode,
}
impl<'a> TenantShardContextIterator<'a> {
    /// Creates an iterator over `tenants`.
    ///
    /// The map stays mutably borrowed for as long as the iterator (or any shard reference
    /// it yielded) is alive. `schedule_mode` is stored and cloned into a fresh
    /// `ScheduleContext` each time `next` is called.
    pub(super) fn new(
        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
        schedule_mode: ScheduleMode,
    ) -> Self {
        let inner = tenants.iter_mut();
        Self {
            inner,
            schedule_mode,
        }
    }
}
impl<'a> Iterator for TenantShardContextIterator<'a> {
    /// One tenant's worth of output: its ID, the context accumulated across its shards,
    /// and mutable references to each of those shards in iteration order.
    type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);

    /// Consumes shards from the underlying map iterator until the final shard of a tenant
    /// has been seen, then yields that tenant's group.
    ///
    /// Returns `None` once the underlying iterator is exhausted. If it runs out before a
    /// tenant's final shard was reached, the partially collected group is dropped.
    ///
    /// # Panics
    ///
    /// Panics if a shard zero is encountered while shards from an earlier, unfinished
    /// group are still pending.
    fn next(&mut self) -> Option<Self::Item> {
        let mut context = ScheduleContext::new(self.schedule_mode.clone());
        let mut group = Vec::new();

        for (id, shard) in self.inner.by_ref() {
            if id.is_shard_zero() {
                // A new tenant starts here, so nothing may be left over from the previous
                // one: its group was handed out when its last shard was reached.
                assert!(group.is_empty());
            }

            // Fold this shard's intended pageservers into the tenant-wide context.
            context.avoid(&shard.intent.all_pageservers());
            group.push(shard);

            // The shard with the highest number completes the tenant.
            if id.shard_number.0 == id.shard_count.count() - 1 {
                return Some((id.tenant_id, context, group));
            }
        }

        None
    }
}
#[cfg(test)]
mod tests {
    use std::{collections::BTreeMap, str::FromStr};

    use pageserver_api::controller_api::PlacementPolicy;
    use utils::shard::{ShardCount, ShardNumber};

    use crate::{
        scheduler::test_utils::make_test_nodes, service::Scheduler,
        tenant_shard::tests::make_test_tenant_with_id,
    };

    use super::*;

    /// Checks that the iterator groups shards per tenant, in map order, and that each
    /// yielded context reflects the scheduled locations of that tenant only.
    #[test]
    fn test_context_iterator() {
        // (tenant ID, shard count, expected locations in the tenant's context).
        // The IDs are hand-crafted so that the tenants come out of the BTreeMap in
        // exactly this order.
        let layout = [
            ("af0480929707ee75372337efaa5ecf96", 1, 2),
            ("bf0480929707ee75372337efaa5ecf96", 4, 8),
            ("cf0480929707ee75372337efaa5ecf96", 1, 2),
        ];

        let mut tenants = BTreeMap::new();
        let mut expected = Vec::new();
        for (hex_id, shard_count, location_count) in layout {
            let shards = make_test_tenant_with_id(
                TenantId::from_str(hex_id).unwrap(),
                PlacementPolicy::Attached(1),
                ShardCount(shard_count),
                None,
            );
            expected.push((
                shards[0].tenant_shard_id.tenant_id,
                usize::from(shard_count),
                location_count,
            ));
            for shard in shards {
                tenants.insert(shard.tenant_shard_id, shard);
            }
        }

        // Schedule every shard so that each one has intended locations to report.
        let nodes = make_test_nodes(3, &[]);
        let mut scheduler = Scheduler::new(nodes.values());
        let mut schedule_context = ScheduleContext::default();
        tenants
            .values_mut()
            .for_each(|shard| shard.schedule(&mut scheduler, &mut schedule_context).unwrap());

        let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
        for (want_tenant, want_shards, want_locations) in expected {
            let (tenant_id, context, shards) = iter.next().unwrap();
            assert_eq!(tenant_id, want_tenant);
            assert_eq!(shards.len(), want_shards);
            // Shards must come out in ascending shard-number order, starting at zero.
            for (shard, number) in shards.iter().zip(0u8..) {
                assert_eq!(shard.tenant_shard_id.shard_number, ShardNumber(number));
            }
            assert_eq!(context.location_count(), want_locations);
        }

        // Release the scheduled intents before the shards are dropped.
        for shard in tenants.values_mut() {
            shard.intent.clear(&mut scheduler);
        }
    }
}