use std::collections::HashMap;
use std::fmt::Debug;

use http_utils::error::ApiError;
use itertools::Itertools;
use pageserver_api::controller_api::AvailabilityZone;
use pageserver_api::models::PageserverUtilization;
use serde::Serialize;
use utils::id::NodeId;

use crate::metrics::NodeLabelGroup;
use crate::node::Node;
use crate::tenant_shard::TenantShard;

/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
    #[error("No pageservers found")]
    NoPageservers,
    #[error("No pageserver found matching constraint")]
    ImpossibleConstraint,
}

impl From<ScheduleError> for ApiError {
    fn from(value: ScheduleError) -> Self {
        ApiError::Conflict(format!("Scheduling error: {}", value))
    }
}

#[derive(Serialize)]
pub enum MaySchedule {
    Yes(PageserverUtilization),
    No,
}

#[derive(Serialize)]
pub(crate) struct SchedulerNode {
    /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
    shard_count: usize,
    /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`].
    attached_shard_count: usize,
    /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node
    /// is in their preferred AZ (i.e. this is their 'home' location)
    home_shard_count: usize,
    /// Availability zone id in which the node resides
    az: AvailabilityZone,

    /// Whether this node is currently eligible to have new shards scheduled (this is derived
    /// from a node's availability state and scheduling policy).
    may_schedule: MaySchedule,
}

pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized {
    fn generate(
        node_id: &NodeId,
        node: &mut SchedulerNode,
        preferred_az: &Option<AvailabilityZone>,
        context: &ScheduleContext,
    ) -> Option<Self>;

    /// Return a score that drops any components based on node utilization: this is useful
    /// for finding scores for scheduling optimisation, when we want to avoid rescheduling
    /// shards due to e.g. disk usage, to avoid flapping.
    fn for_optimization(&self) -> Self;

    fn is_overloaded(&self) -> bool;
    fn node_id(&self) -> NodeId;
}

pub(crate) trait ShardTag {
    type Score: NodeSchedulingScore;
}

pub(crate) struct AttachedShardTag {}
impl ShardTag for AttachedShardTag {
    type Score = NodeAttachmentSchedulingScore;
}

pub(crate) struct SecondaryShardTag {}
impl ShardTag for SecondaryShardTag {
    type Score = NodeSecondarySchedulingScore;
}

#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum AzMatch {
    Yes,
    No,
    Unknown,
}

impl AzMatch {
    fn new(node_az: &AvailabilityZone, shard_preferred_az: Option<&AvailabilityZone>) -> Self {
        match shard_preferred_az {
            Some(preferred_az) if preferred_az == node_az => Self::Yes,
            Some(_preferred_az) => Self::No,
            None => Self::Unknown,
        }
    }
}

#[derive(PartialEq, Eq, Debug, Clone, Copy)]
struct AttachmentAzMatch(AzMatch);

impl Ord for AttachmentAzMatch {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Lower scores indicate a more suitable node.
        // Note that we prefer a node for which we don't have
        // info to a node which we are certain doesn't match the
        // preferred AZ of the shard.
        let az_match_score = |az_match: &AzMatch| match az_match {
            AzMatch::Yes => 0,
            AzMatch::Unknown => 1,
            AzMatch::No => 2,
        };

        az_match_score(&self.0).cmp(&az_match_score(&other.0))
    }
}

impl PartialOrd for AttachmentAzMatch {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

#[derive(PartialEq, Eq, Debug, Clone, Copy)]
struct SecondaryAzMatch(AzMatch);

impl Ord for SecondaryAzMatch {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Lower scores indicate a more suitable node.
        // For secondary locations we wish to avoid the preferred AZ
        // of the shard.
        let az_match_score = |az_match: &AzMatch| match az_match {
            AzMatch::No => 0,
            AzMatch::Unknown => 1,
            AzMatch::Yes => 2,
        };

        az_match_score(&self.0).cmp(&az_match_score(&other.0))
    }
}

impl PartialOrd for SecondaryAzMatch {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
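
// Illustration (editor's addition, derivable from the two `Ord` impls above):
// attachments prefer the shard's home AZ while secondaries avoid it, so the two
// wrappers order `AzMatch` values in opposite directions.
#[cfg(test)]
#[test]
fn az_match_orderings_are_inverted() {
    // Lower compares as "more suitable" in both cases.
    assert!(AttachmentAzMatch(AzMatch::Yes) < AttachmentAzMatch(AzMatch::Unknown));
    assert!(AttachmentAzMatch(AzMatch::Unknown) < AttachmentAzMatch(AzMatch::No));
    assert!(SecondaryAzMatch(AzMatch::No) < SecondaryAzMatch(AzMatch::Unknown));
    assert!(SecondaryAzMatch(AzMatch::Unknown) < SecondaryAzMatch(AzMatch::Yes));
}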

/// Scheduling score of a given node for shard attachments.
/// Lower scores indicate more suitable nodes.
/// Ordering is given by member declaration order (top to bottom).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub(crate) struct NodeAttachmentSchedulingScore {
    /// Flag indicating whether this node matches the preferred AZ
    /// of the shard. For equal affinity scores, nodes in the matching AZ
    /// are considered first.
    az_match: AttachmentAzMatch,
    /// The number of shards belonging to the tenant currently being
    /// scheduled that are attached to this node.
    affinity_score: AffinityScore,
    /// Utilisation score that combines shard count and disk utilisation
    utilization_score: u64,
    /// Total number of shards attached to this node. When nodes have identical utilisation, this
    /// acts as an anti-affinity between attached shards.
    total_attached_shard_count: usize,
    /// Convenience to make selection deterministic in tests and empty systems
    node_id: NodeId,
}

impl NodeSchedulingScore for NodeAttachmentSchedulingScore {
    fn generate(
        node_id: &NodeId,
        node: &mut SchedulerNode,
        preferred_az: &Option<AvailabilityZone>,
        context: &ScheduleContext,
    ) -> Option<Self> {
        let utilization = match &mut node.may_schedule {
            MaySchedule::Yes(u) => u,
            MaySchedule::No => {
                return None;
            }
        };

        Some(Self {
            affinity_score: context
                .nodes
                .get(node_id)
                .copied()
                .unwrap_or(AffinityScore::FREE),
            az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
            utilization_score: utilization.cached_score(),
            total_attached_shard_count: node.attached_shard_count,
            node_id: *node_id,
        })
    }

    /// For use in scheduling optimisation, where we only want to consider the aspects
    /// of the score that can only be resolved by moving things (such as inter-shard affinity
    /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which
    /// can fluctuate for other reasons)
    fn for_optimization(&self) -> Self {
        Self {
            utilization_score: 0,
            total_attached_shard_count: 0,
            node_id: NodeId(0),
            ..*self
        }
    }

    fn is_overloaded(&self) -> bool {
        PageserverUtilization::is_overloaded(self.utilization_score)
    }

    fn node_id(&self) -> NodeId {
        self.node_id
    }
}

/// Scheduling score of a given node for shard secondaries.
/// Lower scores indicate more suitable nodes.
/// Ordering is given by member declaration order (top to bottom).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub(crate) struct NodeSecondarySchedulingScore {
    /// Flag indicating whether this node matches the preferred AZ
    /// of the shard. For secondary locations we wish to avoid nodes in
    /// the preferred AZ of the shard, since that's where the attached location
    /// should be scheduled and having the secondary in the same AZ is bad for HA.
    az_match: SecondaryAzMatch,
    /// The number of shards belonging to the tenant currently being
    /// scheduled that are attached to this node.
    affinity_score: AffinityScore,
    /// Utilisation score that combines shard count and disk utilisation
    utilization_score: u64,
    /// Anti-affinity with other non-home locations: this gives the behavior that secondaries
    /// will spread out across the nodes in an AZ.
    total_non_home_shard_count: usize,
    /// Convenience to make selection deterministic in tests and empty systems
    node_id: NodeId,
}

impl NodeSchedulingScore for NodeSecondarySchedulingScore {
    fn generate(
        node_id: &NodeId,
        node: &mut SchedulerNode,
        preferred_az: &Option<AvailabilityZone>,
        context: &ScheduleContext,
    ) -> Option<Self> {
        let utilization = match &mut node.may_schedule {
            MaySchedule::Yes(u) => u,
            MaySchedule::No => {
                return None;
            }
        };

        Some(Self {
            az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
            affinity_score: context
                .nodes
                .get(node_id)
                .copied()
                .unwrap_or(AffinityScore::FREE),
            utilization_score: utilization.cached_score(),
            total_non_home_shard_count: (node.shard_count - node.home_shard_count),
            node_id: *node_id,
        })
    }

    fn for_optimization(&self) -> Self {
        Self {
            utilization_score: 0,
            total_non_home_shard_count: 0,
            node_id: NodeId(0),
            ..*self
        }
    }

    fn is_overloaded(&self) -> bool {
        PageserverUtilization::is_overloaded(self.utilization_score)
    }

    fn node_id(&self) -> NodeId {
        self.node_id
    }
}

impl PartialEq for SchedulerNode {
    fn eq(&self, other: &Self) -> bool {
        let may_schedule_matches = matches!(
            (&self.may_schedule, &other.may_schedule),
            (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
        );

        may_schedule_matches
            && self.shard_count == other.shard_count
            && self.attached_shard_count == other.attached_shard_count
            && self.az == other.az
    }
}

impl Eq for SchedulerNode {}

/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
/// impl is only for debug dumps.
#[derive(Serialize)]
pub(crate) struct Scheduler {
    nodes: HashMap<NodeId, SchedulerNode>,
}

/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
///
/// For example, we may set an affinity score based on the number of shards from the same
/// tenant already on a node, to implicitly prefer to balance out shards.
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct AffinityScore(pub(crate) usize);

impl AffinityScore {
    /// If we have no anti-affinity at all toward a node, this is its score. It means
    /// the scheduler has a free choice amongst nodes with this score, and may pick a node
    /// based on other information such as total utilization.
    pub(crate) const FREE: Self = Self(0);

    pub(crate) fn inc(&mut self) {
        self.0 += 1;
    }

    pub(crate) fn dec(&mut self) {
        self.0 -= 1;
    }
}

impl std::ops::Add for AffinityScore {
    type Output = Self;

    fn add(self, rhs: Self) -> Self::Output {
        Self(self.0 + rhs.0)
    }
}
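
// Illustration (editor's addition): `AffinityScore` derives `Ord` on its inner
// count, so a node already hosting two of the tenant's shards (score 2) ranks
// after an untouched node (`FREE`, i.e. 0) when the scheduler sorts candidates.
#[cfg(test)]
#[test]
fn affinity_score_orders_by_count() {
    let mut score = AffinityScore::FREE;
    score.inc();
    score.inc();
    assert!(AffinityScore::FREE < score);
    assert_eq!(score + AffinityScore(1), AffinityScore(3));
}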

/// Hint for whether this is a sincere attempt to schedule, or a speculative
/// check for where we _would_ schedule (done during optimization)
#[derive(Debug, Clone)]
pub(crate) enum ScheduleMode {
    Normal,
    Speculative,
}

impl Default for ScheduleMode {
    fn default() -> Self {
        Self::Normal
    }
}

// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default, Clone)]
pub(crate) struct ScheduleContext {
    /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
    pub(crate) nodes: HashMap<NodeId, AffinityScore>,

    pub(crate) mode: ScheduleMode,
}

impl ScheduleContext {
    pub(crate) fn new(mode: ScheduleMode) -> Self {
        Self {
            nodes: HashMap::new(),
            mode,
        }
    }

    /// Input is a list of nodes we would like to avoid using again within this context. The more
    /// times a node is passed into this call, the less inclined we are to use it.
    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
        for node_id in nodes {
            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
            entry.inc()
        }
    }

    /// Remove `shard`'s contributions to this context. This is useful when considering scheduling
    /// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location.
    pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self {
        let mut new_context = self.clone();

        if let Some(attached) = shard.intent.get_attached() {
            if let Some(score) = new_context.nodes.get_mut(attached) {
                score.dec();
            }
        }

        for secondary in shard.intent.get_secondary() {
            if let Some(score) = new_context.nodes.get_mut(secondary) {
                score.dec();
            }
        }

        new_context
    }

    /// For test, track the sum of AffinityScore values, which is effectively how many
    /// attached or secondary locations have been registered with this context.
    #[cfg(test)]
    pub(crate) fn location_count(&self) -> usize {
        self.nodes.values().map(|i| i.0).sum()
    }
}
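
// Typical lifecycle (a sketch, mirroring how the tests at the bottom of this
// file drive it): schedule each shard of a tenant with the same context, and
// register every chosen node so that later shards are steered away from it.
//
//   let mut context = ScheduleContext::new(ScheduleMode::Normal);
//   for _ in 0..shard_count {
//       let node_id = scheduler.schedule_shard::<AttachedShardTag>(&[], &preferred_az, &context)?;
//       context.avoid(&[node_id]);
//   }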

pub(crate) enum RefCountUpdate {
    PromoteSecondary,
    Attach,
    Detach,
    DemoteAttached,
    AddSecondary,
    RemoveSecondary,
}

impl Scheduler {
    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
        let mut scheduler_nodes = HashMap::new();
        for node in nodes {
            scheduler_nodes.insert(
                node.get_id(),
                SchedulerNode {
                    shard_count: 0,
                    attached_shard_count: 0,
                    home_shard_count: 0,
                    may_schedule: node.may_schedule(),
                    az: node.get_availability_zone_id().clone(),
                },
            );
        }

        Self {
            nodes: scheduler_nodes,
        }
    }

    /// For debug/support: check that our internal statistics are in sync with the state of
    /// the nodes & tenant shards.
    ///
    /// If anything is inconsistent, log details and return an error.
    pub(crate) fn consistency_check<'a>(
        &self,
        nodes: impl Iterator<Item = &'a Node>,
        shards: impl Iterator<Item = &'a TenantShard>,
    ) -> anyhow::Result<()> {
        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
        for node in nodes {
            expect_nodes.insert(
                node.get_id(),
                SchedulerNode {
                    shard_count: 0,
                    attached_shard_count: 0,
                    home_shard_count: 0,
                    may_schedule: node.may_schedule(),
                    az: node.get_availability_zone_id().clone(),
                },
            );
        }

        for shard in shards {
            if let Some(node_id) = shard.intent.get_attached() {
                match expect_nodes.get_mut(node_id) {
                    Some(node) => {
                        node.shard_count += 1;
                        node.attached_shard_count += 1;
                        if Some(&node.az) == shard.preferred_az() {
                            node.home_shard_count += 1;
                        }
                    }
                    None => anyhow::bail!(
                        "Tenant {} references nonexistent node {}",
                        shard.tenant_shard_id,
                        node_id
                    ),
                }
            }

            for node_id in shard.intent.get_secondary() {
                match expect_nodes.get_mut(node_id) {
                    Some(node) => {
                        node.shard_count += 1;
                        if Some(&node.az) == shard.preferred_az() {
                            node.home_shard_count += 1;
                        }
                    }
                    None => anyhow::bail!(
                        "Tenant {} references nonexistent node {}",
                        shard.tenant_shard_id,
                        node_id
                    ),
                }
            }
        }

        for (node_id, expect_node) in &expect_nodes {
            let Some(self_node) = self.nodes.get(node_id) else {
                anyhow::bail!("Node {node_id} not found in Self")
            };

            if self_node != expect_node {
                tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
                tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
                tracing::error!("Self state: {}", serde_json::to_string(self_node)?);

                anyhow::bail!("Inconsistent state on {node_id}");
            }
        }

        if expect_nodes.len() != self.nodes.len() {
            // We just checked that all the expected nodes are present. If the lengths don't match,
            // it means that we have nodes in Self that are unexpected.
            for node_id in self.nodes.keys() {
                if !expect_nodes.contains_key(node_id) {
                    anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
                }
            }
        }

        Ok(())
    }

    /// Update the reference counts of a node. These reference counts are used to guide scheduling
    /// decisions, not for memory management: they represent the number of tenant shards whose IntentState
    /// targets this node and the number of tenant shards whose IntentState is attached to this
    /// node.
    ///
    /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
    /// [`Self::new`] or [`Self::node_upsert`])
    pub(crate) fn update_node_ref_counts(
        &mut self,
        node_id: NodeId,
        preferred_az: Option<&AvailabilityZone>,
        update: RefCountUpdate,
    ) {
        let Some(node) = self.nodes.get_mut(&node_id) else {
            debug_assert!(false);
            tracing::error!("Scheduler missing node {node_id}");
            return;
        };

        let is_home_az = Some(&node.az) == preferred_az;

        match update {
            RefCountUpdate::PromoteSecondary => {
                node.attached_shard_count += 1;
            }
            RefCountUpdate::Attach => {
                node.shard_count += 1;
                node.attached_shard_count += 1;
                if is_home_az {
                    node.home_shard_count += 1;
                }
            }
            RefCountUpdate::Detach => {
                node.shard_count -= 1;
                node.attached_shard_count -= 1;
                if is_home_az {
                    node.home_shard_count -= 1;
                }
            }
            RefCountUpdate::DemoteAttached => {
                node.attached_shard_count -= 1;
            }
            RefCountUpdate::AddSecondary => {
                node.shard_count += 1;
                if is_home_az {
                    node.home_shard_count += 1;
                }
            }
            RefCountUpdate::RemoveSecondary => {
                node.shard_count -= 1;
                if is_home_az {
                    node.home_shard_count -= 1;
                }
            }
        }

        // Maybe update PageserverUtilization
        match update {
            RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
                // Referencing the node: if this takes our shard_count above the utilization structure's
                // shard count, then artificially bump it: this ensures that the scheduler immediately
                // recognizes that this node has more work on it, without waiting for the next heartbeat
                // to update the utilization.
                if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
                    utilization.adjust_shard_count_max(node.shard_count as u32);
                }
            }
            RefCountUpdate::PromoteSecondary
            | RefCountUpdate::Detach
            | RefCountUpdate::RemoveSecondary
            | RefCountUpdate::DemoteAttached => {
                // De-referencing the node: leave the utilization's shard_count at a stale higher
                // value until some future heartbeat after we have physically removed this shard
                // from the node: this prevents the scheduler over-optimistically trying to schedule
                // more work onto the node before earlier detaches are done.
            }
        }
    }
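
    // Net effect of each RefCountUpdate on (shard_count, attached_shard_count,
    // home_shard_count); the home count only moves when the node is in the
    // shard's preferred AZ:
    //
    //   Attach            (+1, +1, +1)    Detach            (-1, -1, -1)
    //   AddSecondary      (+1,  0, +1)    RemoveSecondary   (-1,  0, -1)
    //   PromoteSecondary  ( 0, +1,  0)    DemoteAttached    ( 0, -1,  0)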

    // Check if the number of shards attached to a given node is lagging below
    // the cluster average. If that's the case, the node should be filled.
    pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
        let Some(node) = self.nodes.get(&node_id) else {
            debug_assert!(false);
            tracing::error!("Scheduler missing node {node_id}");
            return 0;
        };
        assert!(!self.nodes.is_empty());
        let expected_attached_shards_per_node = self.expected_attached_shard_count();

        for (node_id, node) in self.nodes.iter() {
            tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
        }

        if node.attached_shard_count < expected_attached_shards_per_node {
            expected_attached_shards_per_node - node.attached_shard_count
        } else {
            0
        }
    }

    pub(crate) fn expected_attached_shard_count(&self) -> usize {
        let total_attached_shards: usize =
            self.nodes.values().map(|n| n.attached_shard_count).sum();

        assert!(!self.nodes.is_empty());
        total_attached_shards / self.nodes.len()
    }

    pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
        self.nodes
            .iter()
            .map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
            .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
            .collect()
    }

    pub(crate) fn node_upsert(&mut self, node: &Node) {
        use std::collections::hash_map::Entry::*;
        match self.nodes.entry(node.get_id()) {
            Occupied(mut entry) => {
                // Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
                // to account for any shards scheduled on the controller but not yet visible to the pageserver.
                let mut may_schedule = node.may_schedule();
                match &mut may_schedule {
                    MaySchedule::Yes(utilization) => {
                        utilization.adjust_shard_count_max(entry.get().shard_count as u32);
                    }
                    MaySchedule::No => { // Nothing to tweak
                    }
                }

                entry.get_mut().may_schedule = may_schedule;
            }
            Vacant(entry) => {
                entry.insert(SchedulerNode {
                    shard_count: 0,
                    attached_shard_count: 0,
                    home_shard_count: 0,
                    may_schedule: node.may_schedule(),
                    az: node.get_availability_zone_id().clone(),
                });
            }
        }
    }

    pub(crate) fn node_remove(&mut self, node_id: NodeId) {
        if self.nodes.remove(&node_id).is_none() {
            tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
        }
    }

    /// Calculate a single node's score, used in optimizer logic to compare specific
    /// nodes' scores.
    pub(crate) fn compute_node_score<Score>(
        &mut self,
        node_id: NodeId,
        preferred_az: &Option<AvailabilityZone>,
        context: &ScheduleContext,
    ) -> Option<Score>
    where
        Score: NodeSchedulingScore,
    {
        self.nodes
            .get_mut(&node_id)
            .and_then(|node| Score::generate(&node_id, node, preferred_az, context))
    }

    /// Compute a scheduling score for each node that the scheduler knows of
    /// minus a set of hard excluded nodes.
    fn compute_node_scores<Score>(
        &mut self,
        hard_exclude: &[NodeId],
        preferred_az: &Option<AvailabilityZone>,
        context: &ScheduleContext,
    ) -> Vec<Score>
    where
        Score: NodeSchedulingScore,
    {
        self.nodes
            .iter_mut()
            .filter_map(|(k, v)| {
                if hard_exclude.contains(k) {
                    None
                } else {
                    Score::generate(k, v, preferred_az, context)
                }
            })
            .collect()
    }

    /// hard_exclude: it is forbidden to use nodes in this list, typically because they
    /// are already in use by this shard -- we use this to avoid picking the same node
    /// as both attached and secondary location. This is a hard constraint: if we cannot
    /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
    ///
    /// context: we prefer to avoid using nodes identified in the context, according
    /// to their anti-affinity score. We use this to prefer to avoid placing shards in
    /// the same tenant on the same node. This is a soft constraint: the context will never
    /// cause us to fail to schedule a shard.
    pub(crate) fn schedule_shard<Tag: ShardTag>(
        &mut self,
        hard_exclude: &[NodeId],
        preferred_az: &Option<AvailabilityZone>,
        context: &ScheduleContext,
    ) -> Result<NodeId, ScheduleError> {
        if self.nodes.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

        let mut scores =
            self.compute_node_scores::<Tag::Score>(hard_exclude, preferred_az, context);

        // Exclude nodes whose utilization is critically high, if there are alternatives available. This will
        // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
        // we may place shards in the same tenant together on the same pageserver if all other pageservers are
        // overloaded.
        let non_overloaded_scores = scores
            .iter()
            .filter(|i| !i.is_overloaded())
            .copied()
            .collect::<Vec<_>>();
        if !non_overloaded_scores.is_empty() {
            scores = non_overloaded_scores;
        }

        // Sort the nodes by score. The one with the lowest scores will be the preferred node.
        // Refer to [`NodeAttachmentSchedulingScore`] for attached locations and
        // [`NodeSecondarySchedulingScore`] for secondary locations to understand how the nodes
        // are ranked.
        scores.sort();

        if scores.is_empty() {
            // After applying constraints, no pageservers were left.
            if !matches!(context.mode, ScheduleMode::Speculative) {
                // If this was not a speculative attempt, log details to understand why we couldn't
                // schedule: this may help an engineer understand if some nodes are marked offline
                // in a way that's preventing progress.
                tracing::info!(
                    "Scheduling failure, while excluding {hard_exclude:?}, node states:"
                );
                for (node_id, node) in &self.nodes {
                    tracing::info!(
                        "Node {node_id}: may_schedule={} shards={}",
                        !matches!(node.may_schedule, MaySchedule::No),
                        node.shard_count
                    );
                }
            }
            return Err(ScheduleError::ImpossibleConstraint);
        }

        // Lowest score wins
        let node_id = scores.first().unwrap().node_id();

        if !matches!(context.mode, ScheduleMode::Speculative) {
            tracing::info!(
                "scheduler selected node {node_id} (eligible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?}, preferred_az: {:?})",
                scores.iter().map(|i| i.node_id().0).collect::<Vec<_>>(),
                preferred_az,
            );
        }

        // Note that we do not update shard count here to reflect the scheduling: that
        // is IntentState's job when the scheduled location is used.

        Ok(node_id)
    }
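
    // Example call (a sketch, along the lines of the tests at the bottom of this
    // file): pick an attached location for a shard while excluding the
    // pageservers it already uses.
    //
    //   let node_id = scheduler.schedule_shard::<AttachedShardTag>(
    //       &shard.intent.all_pageservers(),
    //       &shard.preferred_az().cloned(),
    //       &context,
    //   )?;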

    /// Selects any available node. This is suitable for performing background work (e.g. S3
    /// deletions).
    pub(crate) fn any_available_node(&mut self) -> Result<NodeId, ScheduleError> {
        self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default())
    }

    /// For choosing which AZ to schedule a new shard into, use this. It will return the
    /// AZ with the lowest number of shards currently scheduled in this AZ as their home
    /// location.
    ///
    /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded
    /// node, because while tenants start out single sharded, when they grow and undergo
    /// shard-split, they will occupy space on many nodes within an AZ. It is important
    /// that we pick the AZ in a way that balances this _future_ load.
    ///
    /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by
    /// nodes' utilization scores.
    pub(crate) fn get_az_for_new_tenant(&self) -> Option<AvailabilityZone> {
        if self.nodes.is_empty() {
            return None;
        }

        #[derive(Default)]
        struct AzScore {
            home_shard_count: usize,
            scheduleable: bool,
        }

        let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new();
        for node in self.nodes.values() {
            let az = azs.entry(&node.az).or_default();
            az.home_shard_count += node.home_shard_count;
            az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_));
        }

        // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where
        // all nodes are overloaded or otherwise unschedulable).
        if azs.values().any(|i| i.scheduleable) {
            azs.retain(|_, i| i.scheduleable);
        }

        // Find the AZ with the lowest number of shards currently allocated
        Some(
            azs.into_iter()
                .min_by_key(|i| (i.1.home_shard_count, i.0))
                .unwrap()
                .0
                .clone(),
        )
    }

    pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option<AvailabilityZone> {
        self.nodes.get(node_id).map(|n| n.az.clone())
    }

    /// For use when choosing a preferred secondary location: filter out nodes that are not
    /// available, and gather their AZs.
    pub(crate) fn filter_usable_nodes(
        &self,
        nodes: &[NodeId],
    ) -> Vec<(NodeId, Option<AvailabilityZone>)> {
        nodes
            .iter()
            .filter_map(|node_id| {
                let node = self
                    .nodes
                    .get(node_id)
                    .expect("Referenced nodes always exist");
                if matches!(node.may_schedule, MaySchedule::Yes(_)) {
                    Some((*node_id, Some(node.az.clone())))
                } else {
                    None
                }
            })
            .collect()
    }

    /// Unit test access to internal state
    #[cfg(test)]
    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
        self.nodes.get(&node_id).unwrap().shard_count
    }

    #[cfg(test)]
    pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize {
        self.nodes.get(&node_id).unwrap().attached_shard_count
    }

    /// Some metrics that we only calculate periodically: this is simpler than
    /// rigorously updating them on every change.
    pub(crate) fn update_metrics(&self) {
        for (node_id, node) in &self.nodes {
            let node_id_str = format!("{}", node_id);
            let label_group = NodeLabelGroup {
                az: &node.az.0,
                node_id: &node_id_str,
            };

            crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_node_shards
                .set(label_group.clone(), node.shard_count as i64);

            crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_node_attached_shards
                .set(label_group.clone(), node.attached_shard_count as i64);

            crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_node_home_shards
                .set(label_group.clone(), node.home_shard_count as i64);
        }
    }
}

#[cfg(test)]
pub(crate) mod test_utils {

    use std::collections::HashMap;

    use pageserver_api::controller_api::{AvailabilityZone, NodeAvailability};
    use pageserver_api::models::utilization::test_utilization;
    use utils::id::NodeId;

    use crate::node::Node;

    /// Test helper: synthesize the requested number of nodes, all in active state.
    ///
    /// Node IDs start at one.
    ///
    /// The `azs` argument specifies the list of availability zones which will be assigned
    /// to nodes in round-robin fashion. If empty, a default AZ is assigned.
    pub(crate) fn make_test_nodes(n: u64, azs: &[AvailabilityZone]) -> HashMap<NodeId, Node> {
        let mut az_iter = azs.iter().cycle();

        (1..n + 1)
            .map(|i| {
                (NodeId(i), {
                    let mut node = Node::new(
                        NodeId(i),
                        format!("httphost-{i}"),
                        80 + i as u16,
                        None,
                        format!("pghost-{i}"),
                        5432 + i as u16,
                        az_iter
                            .next()
                            .cloned()
                            .unwrap_or(AvailabilityZone("test-az".to_string())),
                        false,
                    )
                    .unwrap();
                    node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
                    assert!(node.is_available());
                    node
                })
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use pageserver_api::controller_api::NodeAvailability;
    use pageserver_api::models::utilization::test_utilization;
    use pageserver_api::shard::ShardIdentity;
    use utils::id::TenantId;
    use utils::shard::{ShardCount, ShardNumber, TenantShardId};

    use super::*;
    use crate::tenant_shard::IntentState;
    #[test]
    fn scheduler_basic() -> anyhow::Result<()> {
        let nodes = test_utils::make_test_nodes(2, &[]);

        let mut scheduler = Scheduler::new(nodes.values());
        let mut t1_intent = IntentState::new(None);
        let mut t2_intent = IntentState::new(None);

        let context = ScheduleContext::default();

        let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?;
        t1_intent.set_attached(&mut scheduler, Some(scheduled));
        let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?;
        t2_intent.set_attached(&mut scheduler, Some(scheduled));

        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);

        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);
        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);

        let scheduled = scheduler.schedule_shard::<AttachedShardTag>(
            &t1_intent.all_pageservers(),
            &None,
            &context,
        )?;
        t1_intent.push_secondary(&mut scheduler, scheduled);

        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1);
        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);

        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
        assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);

        t1_intent.clear(&mut scheduler);
        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1);

        let total_attached = scheduler.get_node_attached_shard_count(NodeId(1))
            + scheduler.get_node_attached_shard_count(NodeId(2));
        assert_eq!(total_attached, 1);

        if cfg!(debug_assertions) {
            // Dropping an IntentState without clearing it causes a panic in debug mode,
            // because we have failed to properly update scheduler shard counts.
            let result = std::panic::catch_unwind(move || {
                drop(t2_intent);
            });
            assert!(result.is_err());
        } else {
            t2_intent.clear(&mut scheduler);

            assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0);
            assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0);

            assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0);
            assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
        }

        Ok(())
    }

    #[test]
    /// Test the PageserverUtilization's contribution to scheduling algorithm
    fn scheduler_utilization() {
        let mut nodes = test_utils::make_test_nodes(3, &[]);
        let mut scheduler = Scheduler::new(nodes.values());

        // Need to keep these alive because they contribute to shard counts via RAII
        let mut scheduled_intents = Vec::new();

        let empty_context = ScheduleContext::default();

        fn assert_scheduler_chooses(
            expect_node: NodeId,
            scheduled_intents: &mut Vec<IntentState>,
            scheduler: &mut Scheduler,
            context: &ScheduleContext,
        ) {
            let scheduled = scheduler
                .schedule_shard::<AttachedShardTag>(&[], &None, context)
                .unwrap();
            let mut intent = IntentState::new(None);
            intent.set_attached(scheduler, Some(scheduled));
            scheduled_intents.push(intent);
            assert_eq!(scheduled, expect_node);
        }

        // Independent schedule calls onto empty nodes should round-robin, because each node's
        // utilization's shard count is updated inline. The order is deterministic because when all other factors are
        // equal, we order by node ID.
        assert_scheduler_chooses(
            NodeId(1),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );
        assert_scheduler_chooses(
            NodeId(2),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );
        assert_scheduler_chooses(
            NodeId(3),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );

        // Manually setting utilization higher should cause schedule calls to round-robin the other nodes
        // which have equal utilization.
        nodes
            .get_mut(&NodeId(1))
            .unwrap()
            .set_availability(NodeAvailability::Active(test_utilization::simple(
                10,
                1024 * 1024 * 1024,
            )));
        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());

        assert_scheduler_chooses(
            NodeId(2),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );
        assert_scheduler_chooses(
            NodeId(3),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );
        assert_scheduler_chooses(
            NodeId(2),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );
        assert_scheduler_chooses(
            NodeId(3),
            &mut scheduled_intents,
            &mut scheduler,
            &empty_context,
        );

        // The scheduler should prefer nodes with lower affinity score,
        // even if they have higher utilization (as long as they aren't utilized at >100%)
        let mut context_prefer_node1 = ScheduleContext::default();
        context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]);
        assert_scheduler_chooses(
            NodeId(1),
            &mut scheduled_intents,
            &mut scheduler,
            &context_prefer_node1,
        );
        assert_scheduler_chooses(
            NodeId(1),
            &mut scheduled_intents,
            &mut scheduler,
            &context_prefer_node1,
        );

        // If a node is over-utilized, it will not be used even if affinity scores prefer it
        nodes
            .get_mut(&NodeId(1))
            .unwrap()
            .set_availability(NodeAvailability::Active(test_utilization::simple(
                20000,
                1024 * 1024 * 1024,
            )));
        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
        assert_scheduler_chooses(
            NodeId(2),
            &mut scheduled_intents,
            &mut scheduler,
            &context_prefer_node1,
        );
        assert_scheduler_chooses(
            NodeId(3),
            &mut scheduled_intents,
            &mut scheduler,
            &context_prefer_node1,
        );

        for mut intent in scheduled_intents {
            intent.clear(&mut scheduler);
        }
    }

    #[test]
    /// A simple test that showcases AZ-aware scheduling and its interaction with
    /// affinity scores.
    fn az_scheduling() {
        let az_a_tag = AvailabilityZone("az-a".to_string());
        let az_b_tag = AvailabilityZone("az-b".to_string());

        let nodes = test_utils::make_test_nodes(3, &[az_a_tag.clone(), az_b_tag.clone()]);
        let mut scheduler = Scheduler::new(nodes.values());

        // Need to keep these alive because they contribute to shard counts via RAII
        let mut scheduled_intents = Vec::new();

        let mut context = ScheduleContext::default();

        fn assert_scheduler_chooses<Tag: ShardTag>(
            expect_node: NodeId,
            preferred_az: Option<AvailabilityZone>,
            scheduled_intents: &mut Vec<IntentState>,
            scheduler: &mut Scheduler,
            context: &mut ScheduleContext,
        ) {
            let scheduled = scheduler
                .schedule_shard::<Tag>(&[], &preferred_az, context)
                .unwrap();
            let mut intent = IntentState::new(preferred_az.clone());
            intent.set_attached(scheduler, Some(scheduled));
            scheduled_intents.push(intent);
            assert_eq!(scheduled, expect_node);

            context.avoid(&[scheduled]);
        }

        assert_scheduler_chooses::<AttachedShardTag>(
            NodeId(1),
            Some(az_a_tag.clone()),
            &mut scheduled_intents,
            &mut scheduler,
            &mut context,
        );

        // Nodes 2 and 3 have affinity score equal to 0, but node 3
        // is in "az-a" so we prefer that.
        assert_scheduler_chooses::<AttachedShardTag>(
            NodeId(3),
            Some(az_a_tag.clone()),
            &mut scheduled_intents,
            &mut scheduler,
            &mut context,
        );

        // Nodes 1 and 3 (az-a) have the same affinity score, so prefer the lowest node id.
        assert_scheduler_chooses::<AttachedShardTag>(
            NodeId(1),
            Some(az_a_tag.clone()),
            &mut scheduled_intents,
            &mut scheduler,
            &mut context,
        );

        // Avoid nodes in "az-a" for the secondary location.
        assert_scheduler_chooses::<SecondaryShardTag>(
            NodeId(2),
            Some(az_a_tag.clone()),
            &mut scheduled_intents,
            &mut scheduler,
            &mut context,
        );

        for mut intent in scheduled_intents {
            intent.clear(&mut scheduler);
        }
    }

    #[test]
    fn az_scheduling_for_new_tenant() {
        let az_a_tag = AvailabilityZone("az-a".to_string());
        let az_b_tag = AvailabilityZone("az-b".to_string());
        let nodes = test_utils::make_test_nodes(
            6,
            &[
                az_a_tag.clone(),
                az_a_tag.clone(),
                az_a_tag.clone(),
                az_b_tag.clone(),
                az_b_tag.clone(),
                az_b_tag.clone(),
            ],
        );

        let mut scheduler = Scheduler::new(nodes.values());

        /// Force the `home_shard_count` of a node directly: this is the metric used
        /// by the scheduler when picking AZs.
        fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) {
            let node = scheduler.nodes.get_mut(&node_id).unwrap();
            node.home_shard_count = shard_count;
        }

        // Initial empty state. Scores are tied, scheduler prefers lower AZ ID.
        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));

        // Home shard count is higher in AZ A, so AZ B will be preferred
        set_shard_count(&mut scheduler, NodeId(1), 10);
        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone()));

        // Total home shard count is higher in AZ B, so we revert to preferring AZ A
        set_shard_count(&mut scheduler, NodeId(4), 6);
        set_shard_count(&mut scheduler, NodeId(5), 6);
        assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone()));
    }

    /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes
    #[test]
    fn az_selection_many() {
        let az_a_tag = AvailabilityZone("az-a".to_string());
        let az_b_tag = AvailabilityZone("az-b".to_string());
        let az_c_tag = AvailabilityZone("az-c".to_string());
        let nodes = test_utils::make_test_nodes(
            6,
            &[
                az_a_tag.clone(),
                az_b_tag.clone(),
                az_c_tag.clone(),
                az_a_tag.clone(),
                az_b_tag.clone(),
                az_c_tag.clone(),
            ],
        );

        let mut scheduler = Scheduler::new(nodes.values());

        // We should get 1/6th of these on each node, give or take a few...
        let total_tenants = 300;

        // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot
        // on one AZ before correcting itself. This is because we select the 'home' AZ based on
        // an AZ-wide metric, but we select the location for secondaries on a purely node-based
        // metric (while excluding the home AZ).
        let grace = 3;

        let mut scheduled_shards = Vec::new();
        for _i in 0..total_tenants {
            let preferred_az = scheduler.get_az_for_new_tenant().unwrap();

            let mut node_home_counts = scheduler
                .nodes
                .iter()
                .map(|(node_id, node)| (node_id, node.home_shard_count))
                .collect::<Vec<_>>();
            node_home_counts.sort_by_key(|i| i.0);
            eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts);

            let tenant_shard_id = TenantShardId {
                tenant_id: TenantId::generate(),
                shard_number: ShardNumber(0),
                shard_count: ShardCount(1),
            };

            let shard_identity = ShardIdentity::new(
                tenant_shard_id.shard_number,
                tenant_shard_id.shard_count,
                pageserver_api::shard::ShardStripeSize(1),
            )
            .unwrap();
            let mut shard = TenantShard::new(
                tenant_shard_id,
                shard_identity,
                pageserver_api::controller_api::PlacementPolicy::Attached(1),
                Some(preferred_az),
            );

            let mut context = ScheduleContext::default();
            shard.schedule(&mut scheduler, &mut context).unwrap();
            eprintln!("Scheduled shard at {:?}", shard.intent);

            scheduled_shards.push(shard);
        }

        for (node_id, node) in &scheduler.nodes {
            eprintln!(
                "Node {}: {} {} {}",
                node_id, node.shard_count, node.attached_shard_count, node.home_shard_count
            );
        }

        for node in scheduler.nodes.values() {
            assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace);
        }

        for mut shard in scheduled_shards {
            shard.intent.clear(&mut scheduler);
        }
    }

    #[test]
    /// Make sure that when we have an odd number of nodes and an even number of shards, we still
    /// get scheduling stability.
    fn odd_nodes_stability() {
        let az_a = AvailabilityZone("az-a".to_string());
        let az_b = AvailabilityZone("az-b".to_string());

        let nodes = test_utils::make_test_nodes(
            10,
            &[
                az_a.clone(),
                az_a.clone(),
                az_a.clone(),
                az_a.clone(),
                az_a.clone(),
                az_b.clone(),
                az_b.clone(),
                az_b.clone(),
                az_b.clone(),
                az_b.clone(),
            ],
        );
        let mut scheduler = Scheduler::new(nodes.values());

        // Need to keep these alive because they contribute to shard counts via RAII
        let mut scheduled_shards = Vec::new();

        let mut context = ScheduleContext::default();

        fn schedule_shard(
            tenant_shard_id: TenantShardId,
            expect_attached: NodeId,
            expect_secondary: NodeId,
            scheduled_shards: &mut Vec<TenantShard>,
            scheduler: &mut Scheduler,
            preferred_az: Option<AvailabilityZone>,
            context: &mut ScheduleContext,
        ) {
            let shard_identity = ShardIdentity::new(
                tenant_shard_id.shard_number,
                tenant_shard_id.shard_count,
                pageserver_api::shard::ShardStripeSize(1),
            )
            .unwrap();
            let mut shard = TenantShard::new(
                tenant_shard_id,
                shard_identity,
                pageserver_api::controller_api::PlacementPolicy::Attached(1),
                preferred_az,
            );

            shard.schedule(scheduler, context).unwrap();

            assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
            assert_eq!(
                shard.intent.get_secondary().first().unwrap(),
                &expect_secondary
            );

            scheduled_shards.push(shard);
        }

        let tenant_id = TenantId::generate();

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(0),
                shard_count: ShardCount(8),
            },
            NodeId(1),
            NodeId(6),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(1),
                shard_count: ShardCount(8),
            },
            NodeId(2),
            NodeId(7),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(2),
                shard_count: ShardCount(8),
            },
            NodeId(3),
            NodeId(8),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(3),
                shard_count: ShardCount(8),
            },
            NodeId(4),
            NodeId(9),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(4),
                shard_count: ShardCount(8),
            },
            NodeId(5),
            NodeId(10),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(5),
                shard_count: ShardCount(8),
            },
            NodeId(1),
            NodeId(6),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(6),
                shard_count: ShardCount(8),
            },
            NodeId(2),
            NodeId(7),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        schedule_shard(
            TenantShardId {
                tenant_id,
                shard_number: ShardNumber(7),
                shard_count: ShardCount(8),
            },
            NodeId(3),
            NodeId(8),
            &mut scheduled_shards,
            &mut scheduler,
            Some(az_a.clone()),
            &mut context,
        );

        // Assert that the optimizer suggests no changes, i.e. our initial scheduling was stable.
        for shard in &scheduled_shards {
            assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None);
        }

        for mut shard in scheduled_shards {
            shard.intent.clear(&mut scheduler);
        }
    }
}