mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 18:40:38 +00:00
storage controller: fix a disagreement between schedule + optimize
Closes: https://github.com/neondatabase/neon/issues/8969
This commit is contained in:
@@ -206,6 +206,10 @@ pub(crate) struct NodeSecondarySchedulingScore {
|
||||
/// The number of shards belonging to the tenant currently being
|
||||
/// scheduled that are attached to this node.
|
||||
affinity_score: AffinityScore,
|
||||
/// Size of [`ScheduleContext::attached_nodes`] for the current node.
|
||||
/// This normally tracks the number of attached shards belonging to the
|
||||
/// tenant being scheduled that are already on this node.
|
||||
secondary_shards_in_context: usize,
|
||||
/// Utilisation score that combines shard count and disk utilisation
|
||||
utilization_score: u64,
|
||||
/// Total number of shards attached to this node. When nodes have identical utilisation, this
|
||||
@@ -231,6 +235,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
|
||||
|
||||
Some(Self {
|
||||
az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
|
||||
secondary_shards_in_context: context.secondary_nodes.get(node_id).copied().unwrap_or(0),
|
||||
affinity_score: context
|
||||
.nodes
|
||||
.get(node_id)
|
||||
@@ -327,6 +332,9 @@ pub(crate) struct ScheduleContext {
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
|
||||
/// Specifically how many _secondary_ locations are on each node
|
||||
pub(crate) secondary_nodes: HashMap<NodeId, usize>,
|
||||
|
||||
pub(crate) mode: ScheduleMode,
|
||||
}
|
||||
|
||||
@@ -345,6 +353,11 @@ impl ScheduleContext {
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, node_id: NodeId) {
|
||||
let entry = self.secondary_nodes.entry(node_id).or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
|
||||
self.nodes
|
||||
.get(&node_id)
|
||||
@@ -786,7 +799,14 @@ pub(crate) mod test_utils {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
|
||||
use pageserver_api::{
|
||||
controller_api::NodeAvailability, models::utilization::test_utilization,
|
||||
shard::ShardIdentity, shard::TenantShardId,
|
||||
};
|
||||
use utils::{
|
||||
id::TenantId,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1074,4 +1094,171 @@ mod tests {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn repro_foo() {
|
||||
let az_tag = AvailabilityZone("az-a".to_string());
|
||||
|
||||
let nodes = test_utils::make_test_nodes(
|
||||
5,
|
||||
&[
|
||||
az_tag.clone(),
|
||||
az_tag.clone(),
|
||||
az_tag.clone(),
|
||||
az_tag.clone(),
|
||||
az_tag.clone(),
|
||||
],
|
||||
);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
// Need to keep these alive because they contribute to shard counts via RAII
|
||||
let mut scheduled_shards = Vec::new();
|
||||
|
||||
let mut context = ScheduleContext::default();
|
||||
|
||||
fn schedule_shard(
|
||||
tenant_shard_id: TenantShardId,
|
||||
expect_attached: NodeId,
|
||||
expect_secondary: NodeId,
|
||||
scheduled_shards: &mut Vec<TenantShard>,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &mut ScheduleContext,
|
||||
) {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
tenant_shard_id.shard_number,
|
||||
tenant_shard_id.shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(1),
|
||||
)
|
||||
.unwrap();
|
||||
let mut shard = TenantShard::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
pageserver_api::controller_api::PlacementPolicy::Attached(1),
|
||||
);
|
||||
|
||||
shard.schedule(scheduler, context).unwrap();
|
||||
|
||||
assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
|
||||
assert_eq!(
|
||||
shard.intent.get_secondary().first().unwrap(),
|
||||
&expect_secondary
|
||||
);
|
||||
|
||||
scheduled_shards.push(shard);
|
||||
}
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(1),
|
||||
NodeId(2),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(1),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(3),
|
||||
NodeId(4),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(2),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(5),
|
||||
NodeId(1),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(3),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(2),
|
||||
NodeId(3),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(4),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(4),
|
||||
NodeId(5),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(5),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(1),
|
||||
NodeId(2),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(6),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(3),
|
||||
NodeId(4),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
schedule_shard(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(7),
|
||||
shard_count: ShardCount(8),
|
||||
},
|
||||
NodeId(5),
|
||||
NodeId(1),
|
||||
&mut scheduled_shards,
|
||||
&mut scheduler,
|
||||
&mut context,
|
||||
);
|
||||
|
||||
for shard in &scheduled_shards {
|
||||
assert_eq!(shard.optimize_attachment(&nodes, &context), None);
|
||||
}
|
||||
|
||||
for mut shard in scheduled_shards {
|
||||
shard.intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5858,10 +5858,7 @@ impl Service {
|
||||
|
||||
// Accumulate the schedule context for all the shards in a tenant: we must have
|
||||
// the total view of all shards before we can try to optimize any of them.
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
shard.populate_context(&mut schedule_context);
|
||||
tenant_shards.push(shard);
|
||||
|
||||
// Once we have seen the last shard in the tenant, proceed to search across all shards
|
||||
|
||||
@@ -566,10 +566,7 @@ impl TenantShard {
|
||||
) -> Result<(), ScheduleError> {
|
||||
let r = self.do_schedule(scheduler, context);
|
||||
|
||||
context.avoid(&self.intent.all_pageservers());
|
||||
if let Some(attached) = self.intent.get_attached() {
|
||||
context.push_attached(*attached);
|
||||
}
|
||||
self.populate_context(context);
|
||||
|
||||
r
|
||||
}
|
||||
@@ -676,6 +673,19 @@ impl TenantShard {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// When building the ScheduleContext of a tenant, call this on each shard to
|
||||
/// add its contribution to the context.
|
||||
pub(crate) fn populate_context(&self, context: &mut ScheduleContext) {
|
||||
context.avoid(&self.intent.all_pageservers());
|
||||
|
||||
if let Some(attached) = self.intent.get_attached() {
|
||||
context.push_attached(*attached);
|
||||
}
|
||||
for secondary in self.intent.get_secondary() {
|
||||
context.push_secondary(*secondary);
|
||||
}
|
||||
}
|
||||
|
||||
/// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
|
||||
/// if the swap is not possible and leaves the intent state in its original state.
|
||||
///
|
||||
@@ -1632,10 +1642,8 @@ pub(crate) mod tests {
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
shard_a.populate_context(&mut schedule_context);
|
||||
shard_b.populate_context(&mut schedule_context);
|
||||
|
||||
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
|
||||
|
||||
@@ -1699,10 +1707,8 @@ pub(crate) mod tests {
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
shard_a.populate_context(&mut schedule_context);
|
||||
shard_b.populate_context(&mut schedule_context);
|
||||
|
||||
let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
|
||||
|
||||
@@ -1744,10 +1750,7 @@ pub(crate) mod tests {
|
||||
let mut any_changed = false;
|
||||
|
||||
for shard in shards.iter() {
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
shard.populate_context(&mut schedule_context);
|
||||
}
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
|
||||
Reference in New Issue
Block a user