use std::{
    collections::{HashMap, HashSet},
    sync::Arc,
    time::Duration,
};

use crate::{
    metrics::{
        self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome,
    },
    persistence::TenantShardPersistence,
    reconciler::{ReconcileUnits, ReconcilerConfig},
    scheduler::{
        AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext,
        SecondaryShardTag,
    },
    service::ReconcileResultRequest,
};
use futures::future::{self, Either};
use itertools::Itertools;
use pageserver_api::controller_api::{
    AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
};
use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
};
use serde::{Deserialize, Serialize};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{instrument, Instrument};
use utils::{
    generation::Generation,
    id::NodeId,
    seqwait::{SeqWait, SeqWaitError},
    sync::gate::GateGuard,
};

use crate::{
    compute_hook::ComputeHook,
    node::Node,
    persistence::{split_state::SplitState, Persistence},
    reconciler::{
        attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
    },
    scheduler::{ScheduleError, Scheduler},
    service, Sequence,
};

/// Serialization helper: emit the `Display` form of the error held in `v`, or an
/// empty string when no error is stored.
///
/// Panics if the mutex is poisoned.
fn read_last_error<S, T>(v: &std::sync::Mutex<Option<T>>, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::ser::Serializer,
    T: std::fmt::Display,
{
    let guard = v.lock().unwrap();
    match guard.as_ref() {
        // Let the serializer format the error directly instead of building a
        // temporary String first.
        Some(e) => serializer.collect_str(e),
        None => serializer.collect_str(""),
    }
}

/// In-memory state for a particular tenant shard.
///
/// This struct implements Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)] pub(crate) struct TenantShard { pub(crate) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, // Runtime only: sequence used to coordinate when updating this object while // with background reconcilers may be running. A reconciler runs to a particular // sequence. pub(crate) sequence: Sequence, // Latest generation number: next time we attach, increment this // and use the incremented number when attaching. // // None represents an incompletely onboarded tenant via the [`Service::location_config`] // API, where this tenant may only run in PlacementPolicy::Secondary. pub(crate) generation: Option, // High level description of how the tenant should be set up. Provided // externally. pub(crate) policy: PlacementPolicy, // Low level description of exactly which pageservers should fulfil // which role. Generated by `Self::schedule`. pub(crate) intent: IntentState, // Low level description of how the tenant is configured on pageservers: // if this does not match `Self::intent` then the tenant needs reconciliation // with `Self::reconcile`. pub(crate) observed: ObservedState, // Tenant configuration, passed through opaquely to the pageserver. Identical // for all shards in a tenant. pub(crate) config: TenantConfig, /// If a reconcile task is currently in flight, it may be joined here (it is /// only safe to join if either the result has been received or the reconciler's /// cancellation token has been fired) #[serde(skip)] pub(crate) reconciler: Option, /// If a tenant is being split, then all shards with that TenantId will have a /// SplitState set, this acts as a guard against other operations such as background /// reconciliation, and timeline creation. pub(crate) splitting: SplitState, /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag /// is set. This flag is cleared when the tenant is popped off the delay queue. 
pub(crate) delayed_reconcile: bool, /// Optionally wait for reconciliation to complete up to a particular /// sequence number. #[serde(skip)] pub(crate) waiter: std::sync::Arc>, /// Indicates sequence number for which we have encountered an error reconciling. If /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred, /// and callers should stop waiting for `waiter` and propagate the error. #[serde(skip)] pub(crate) error_waiter: std::sync::Arc>, /// The most recent error from a reconcile on this tenant. This is a nested Arc /// because: /// - ReconcileWaiters need to Arc-clone the overall object to read it later /// - ReconcileWaitError needs to use an `Arc` because we can construct /// many waiters for one shard, and the underlying error types are not Clone. /// /// TODO: generalize to an array of recent events /// TOOD: use a ArcSwap instead of mutex for faster reads? #[serde(serialize_with = "read_last_error")] pub(crate) last_error: std::sync::Arc>>>, /// If we have a pending compute notification that for some reason we weren't able to send, /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope /// of state that we publish externally in an eventually consistent way. pub(crate) pending_compute_notification: bool, // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. 
preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] pub(crate) struct IntentState { attached: Option, secondary: Vec, } impl IntentState { pub(crate) fn new() -> Self { Self { attached: None, secondary: vec![], } } pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { if let Some(node_id) = node_id { scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); } Self { attached: node_id, secondary: vec![], } } pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); } if let Some(new_attached) = &new_attached { scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); } self.attached = new_attached; } } /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from /// secondary to attached while maintaining the scheduler's reference counts. pub(crate) fn promote_attached( &mut self, scheduler: &mut Scheduler, promote_secondary: NodeId, ) { // If we call this with a node that isn't in secondary, it would cause incorrect // scheduler reference counting, since we assume the node is already referenced as a secondary. 
debug_assert!(self.secondary.contains(&promote_secondary)); self.secondary.retain(|n| n != &promote_secondary); let demoted = self.attached; self.attached = Some(promote_secondary); scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); if let Some(demoted) = demoted { scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { debug_assert!(!self.secondary.contains(&new_secondary)); scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); self.secondary.push(new_secondary); } /// It is legal to call this with a node that is not currently a secondary: that is a no-op pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) 
{ scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); } self.clear_secondary(scheduler); } pub(crate) fn all_pageservers(&self) -> Vec { let mut result = Vec::new(); if let Some(p) = self.attached { result.push(p) } result.extend(self.secondary.iter().copied()); result } pub(crate) fn get_attached(&self) -> &Option { &self.attached } pub(crate) fn get_secondary(&self) -> &Vec { &self.secondary } /// If the node is in use as the attached location, demote it into /// the list of secondary locations. This is used when a node goes offline, /// and we want to use a different node for attachment, but not permanently /// forget the location on the offline node. /// /// Returns true if a change was made pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool { if self.attached == Some(node_id) { self.attached = None; self.secondary.push(node_id); scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); true } else { false } } } impl Drop for IntentState { fn drop(&mut self) { // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler. // We do not check this while panicking, to avoid polluting unit test failures or // other assertions with this assertion's output. It's still wrong to leak these, // but if we already have a panic then we don't need to independently flag this case. 
if !(std::thread::panicking()) { debug_assert!(self.attached.is_none() && self.secondary.is_empty()); } } } #[derive(Default, Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedState { pub(crate) locations: HashMap, } /// Our latest knowledge of how this tenant is configured in the outside world. /// /// Meaning: /// * No instance of this type exists for a node: we are certain that we have nothing configured on that /// node for this shard. /// * Instance exists with conf==None: we *might* have some state on that node, but we don't know /// what it is (e.g. we failed partway through configuring it) /// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, /// and that configuration will still be present unless something external interfered. #[derive(Clone, Serialize, Deserialize, Debug)] pub(crate) struct ObservedStateLocation { /// If None, it means we do not know the status of this shard's location on this node, but /// we know that we might have some state on this node. pub(crate) conf: Option, } pub(crate) struct ReconcilerWaiter { // For observability purposes, remember the ID of the shard we're // waiting for. 
pub(crate) tenant_shard_id: TenantShardId, seq_wait: std::sync::Arc>, error_seq_wait: std::sync::Arc>, error: std::sync::Arc>>>, seq: Sequence, } pub(crate) enum ReconcilerStatus { Done, Failed, InProgress, } #[derive(thiserror::Error, Debug)] pub(crate) enum ReconcileWaitError { #[error("Timeout waiting for shard {0}")] Timeout(TenantShardId), #[error("shutting down")] Shutdown, #[error("Reconcile error on shard {0}: {1}")] Failed(TenantShardId, Arc), } #[derive(Eq, PartialEq, Debug, Clone)] pub(crate) struct ReplaceSecondary { old_node_id: NodeId, new_node_id: NodeId, } #[derive(Eq, PartialEq, Debug, Clone)] pub(crate) struct MigrateAttachment { pub(crate) old_attached_node_id: NodeId, pub(crate) new_attached_node_id: NodeId, } #[derive(Eq, PartialEq, Debug, Clone)] pub(crate) enum ScheduleOptimizationAction { // Replace one of our secondary locations with a different node ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), } #[derive(Eq, PartialEq, Debug, Clone)] pub(crate) struct ScheduleOptimization { // What was the reconcile sequence when we generated this optimization? The optimization // should only be applied if the shard's sequence is still at this value, in case other changes // happened between planning the optimization and applying it. sequence: Sequence, pub(crate) action: ScheduleOptimizationAction, } impl ReconcilerWaiter { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { tokio::select! 
{ result = self.seq_wait.wait_for_timeout(self.seq, timeout)=> { result.map_err(|e| match e { SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id), SeqWaitError::Shutdown => ReconcileWaitError::Shutdown })?; }, result = self.error_seq_wait.wait_for(self.seq) => { result.map_err(|e| match e { SeqWaitError::Shutdown => ReconcileWaitError::Shutdown, SeqWaitError::Timeout => unreachable!() })?; return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) } } Ok(()) } pub(crate) fn get_status(&self) -> ReconcilerStatus { if self.seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Done } else if self.error_seq_wait.would_wait_for(self.seq).is_ok() { ReconcilerStatus::Failed } else { ReconcilerStatus::InProgress } } } /// Having spawned a reconciler task, the tenant shard's state will carry enough /// information to optionally cancel & await it later. pub(crate) struct ReconcilerHandle { sequence: Sequence, handle: JoinHandle<()>, cancel: CancellationToken, } pub(crate) enum ReconcileNeeded { /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) No, /// shard has a reconciler running, and its intent hasn't changed since that one was /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] Yes, } /// Pending modification to the observed state of a tenant shard. /// Produced by [`Reconciler::observed_deltas`] and applied in [`crate::service::Service::process_result`]. 
pub(crate) enum ObservedStateDelta { Upsert(Box<(NodeId, ObservedStateLocation)>), Delete(NodeId), } impl ObservedStateDelta { pub(crate) fn node_id(&self) -> &NodeId { match self { Self::Upsert(up) => &up.0, Self::Delete(nid) => nid, } } } /// When a reconcile task completes, it sends this result object /// to be applied to the primary TenantShard. pub(crate) struct ReconcileResult { pub(crate) sequence: Sequence, /// On errors, `observed` should be treated as an incompleted description /// of state (i.e. any nodes present in the result should override nodes /// present in the parent tenant state, but any unmentioned nodes should /// not be removed from parent tenant state) pub(crate) result: Result<(), ReconcileError>, pub(crate) tenant_shard_id: TenantShardId, pub(crate) generation: Option, pub(crate) observed_deltas: Vec, /// Set [`TenantShard::pending_compute_notification`] from this flag pub(crate) pending_compute_notification: bool, } impl ObservedState { pub(crate) fn new() -> Self { Self { locations: HashMap::new(), } } pub(crate) fn is_empty(&self) -> bool { self.locations.is_empty() } } impl TenantShard { pub(crate) fn new( tenant_shard_id: TenantShardId, shard: ShardIdentity, policy: PlacementPolicy, preferred_az_id: Option, ) -> Self { metrics::METRICS_REGISTRY .metrics_group .storage_controller_tenant_shards .inc(); Self { tenant_shard_id, policy, intent: IntentState::default(), generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), config: TenantConfig::default(), reconciler: None, splitting: SplitState::Idle, sequence: Sequence(1), delayed_reconcile: false, waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), preferred_az_id, } } /// For use on startup when learning state from pageservers: generate my [`IntentState`] from my /// [`ObservedState`], even if it 
violates my [`PlacementPolicy`]. Call [`Self::schedule`] next, /// to get an intent state that complies with placement policy. The overall goal is to do scheduling /// in a way that makes use of any configured locations that already exist in the outside world. pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) { // Choose an attached location by filtering observed locations, and then sorting to get the highest // generation let mut attached_locs = self .observed .locations .iter() .filter_map(|(node_id, l)| { if let Some(conf) = &l.conf { if conf.mode == LocationConfigMode::AttachedMulti || conf.mode == LocationConfigMode::AttachedSingle || conf.mode == LocationConfigMode::AttachedStale { Some((node_id, conf.generation)) } else { None } } else { None } }) .collect::>(); attached_locs.sort_by_key(|i| i.1); if let Some((node_id, _gen)) = attached_locs.into_iter().last() { self.intent.set_attached(scheduler, Some(*node_id)); } // All remaining observed locations generate secondary intents. This includes None // observations, as these may well have some local content on disk that is usable (this // is an edge case that might occur if we restarted during a migration or other change) // // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`] // will take care of promoting one of these secondaries to be attached. self.observed.locations.keys().for_each(|node_id| { if Some(*node_id) != self.intent.attached { self.intent.push_secondary(scheduler, *node_id); } }); } /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the /// attached pageserver for a shard. /// /// Returns whether we modified it, and the NodeId selected. 
fn schedule_attached( &mut self, scheduler: &mut Scheduler, context: &ScheduleContext, ) -> Result<(bool, NodeId), ScheduleError> { // No work to do if we already have an attached tenant if let Some(node_id) = self.intent.attached { return Ok((false, node_id)); } if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { // Promote a secondary tracing::debug!("Promoted secondary {} to attached", promote_secondary); self.intent.promote_attached(scheduler, promote_secondary); Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable let node_id = scheduler.schedule_shard::( &self.intent.secondary, &self.preferred_az_id, context, )?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) } } #[instrument(skip_all, fields( tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), sequence=%self.sequence ))] pub(crate) fn schedule( &mut self, scheduler: &mut Scheduler, context: &mut ScheduleContext, ) -> Result<(), ScheduleError> { let r = self.do_schedule(scheduler, context); context.avoid(&self.intent.all_pageservers()); if let Some(attached) = self.intent.get_attached() { context.push_attached(*attached); } r } pub(crate) fn do_schedule( &mut self, scheduler: &mut Scheduler, context: &ScheduleContext, ) -> Result<(), ScheduleError> { // TODO: before scheduling new nodes, check if any existing content in // self.intent refers to pageservers that are offline, and pick other // pageservers if so. // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not // change their attach location. 
match self.scheduling_policy { ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {} ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { // Warn to make it obvious why other things aren't happening/working, if we skip scheduling tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), "Scheduling is disabled by policy {:?}", self.scheduling_policy); return Ok(()); } } // Build the set of pageservers already in use by this tenant, to avoid scheduling // more work on the same pageservers we're already using. let mut modified = false; // Add/remove nodes to fulfil policy use PlacementPolicy::*; match self.policy { Attached(secondary_count) => { let retain_secondaries = if self.intent.attached.is_none() && scheduler.node_preferred(&self.intent.secondary).is_some() { // If we have no attached, and one of the secondaries is elegible to be promoted, retain // one more secondary than we usually would, as one of them will become attached futher down this function. 
secondary_count + 1 } else { secondary_count }; while self.intent.secondary.len() > retain_secondaries { // We have no particular preference for one secondary location over another: just // arbitrarily drop from the end self.intent.pop_secondary(scheduler); modified = true; } // Should have exactly one attached, and N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler, context)?; modified |= modified_attached; let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard::( &used_pageservers, &self.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; } } Secondary => { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node self.intent.demote_attached(scheduler, *node_id); modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node let node_id = scheduler.schedule_shard::( &[], &self.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); modified = true; } while self.intent.secondary.len() > 1 { // We have no particular preference for one secondary location over another: just // arbitrarily drop from the end self.intent.pop_secondary(scheduler); modified = true; } } Detached => { // Never add locations in this mode if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() { self.intent.clear(scheduler); modified = true; } } } if modified { self.sequence.0 += 1; } Ok(()) } /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error /// if the swap is not possible and leaves the intent state in its original state. 
/// /// Arguments: /// `attached_to`: the currently attached location matching the intent state (may be None if the /// shard is not attached) /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask /// the scheduler to recommend a node pub(crate) fn reschedule_to_secondary( &mut self, promote_to: Option, scheduler: &mut Scheduler, ) -> Result<(), ScheduleError> { let promote_to = match promote_to { Some(node) => node, None => match scheduler.node_preferred(self.intent.get_secondary()) { Some(node) => node, None => { return Err(ScheduleError::ImpossibleConstraint); } }, }; assert!(self.intent.get_secondary().contains(&promote_to)); if let Some(node) = self.intent.get_attached() { let demoted = self.intent.demote_attached(scheduler, *node); if !demoted { return Err(ScheduleError::ImpossibleConstraint); } } self.intent.promote_attached(scheduler, promote_to); // Increment the sequence number for the edge case where a // reconciler is already running to avoid waiting on the // current reconcile instead of spawning a new one. self.sequence = self.sequence.next(); Ok(()) } /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, nodes: &HashMap, schedule_context: &ScheduleContext, ) -> Option { let attached = (*self.intent.get_attached())?; if self.intent.secondary.is_empty() { // We can only do useful work if we have both attached and secondary locations: this // function doesn't schedule new locations, only swaps between attached and secondaries. 
return None; } let current_affinity_score = schedule_context.get_node_affinity(attached); let current_attachment_count = schedule_context.get_node_attachments(attached); // Generate score for each node, dropping any un-schedulable nodes. let all_pageservers = self.intent.all_pageservers(); let mut scores = all_pageservers .iter() .flat_map(|node_id| { let node = nodes.get(node_id); if node.is_none() { None } else if matches!( node.unwrap().get_scheduling(), NodeSchedulingPolicy::Filling ) { // If the node is currently filling, don't count it as a candidate to avoid, // racing with the background fill. None } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { None } else { let affinity_score = schedule_context.get_node_affinity(*node_id); let attachment_count = schedule_context.get_node_attachments(*node_id); Some((*node_id, affinity_score, attachment_count)) } }) .collect::>(); // Sort precedence: // 1st - prefer nodes with the lowest total affinity score // 2nd - prefer nodes with the lowest number of attachments in this context // 3rd - if all else is equal, sort by node ID for determinism in tests. scores.sort_by_key(|i| (i.1, i.2, i.0)); if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = scores.first() { if attached != *preferred_node { // The best alternative must be more than 1 better than us, otherwise we could end // up flapping back next time we're called (e.g. there's no point migrating from // a location with score 1 to a score zero, because on next location the situation // would be the same, but in reverse). 
if current_affinity_score > *preferred_affinity_score + AffinityScore(1) || current_attachment_count > *preferred_attachment_count + 1 { tracing::info!( "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", self.intent.get_secondary() ); return Some(ScheduleOptimization { sequence: self.sequence, action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { old_attached_node_id: attached, new_attached_node_id: *preferred_node, }), }); } } else { tracing::debug!( "Node {} is already preferred (score {:?})", preferred_node, preferred_affinity_score ); } } // Fall-through: we didn't find an optimization None } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_secondary( &self, scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { if self.intent.secondary.is_empty() { // We can only do useful work if we have both attached and secondary locations: this // function doesn't schedule new locations, only swaps between attached and secondaries. return None; } for secondary in self.intent.get_secondary() { let Some(affinity_score) = schedule_context.nodes.get(secondary) else { // We're already on a node unaffected any affinity constraints, // so we won't change it. continue; }; // Let the scheduler suggest a node, where it would put us if we were scheduling afresh // This implicitly limits the choice to nodes that are available, and prefers nodes // with lower utilization. 
let Ok(candidate_node) = scheduler.schedule_shard::( &self.intent.all_pageservers(), &self.preferred_az_id, schedule_context, ) else { // A scheduling error means we have no possible candidate replacements continue; }; let candidate_affinity_score = schedule_context .nodes .get(&candidate_node) .unwrap_or(&AffinityScore::FREE); // The best alternative must be more than 1 better than us, otherwise we could end // up flapping back next time we're called. if *candidate_affinity_score + AffinityScore(1) < *affinity_score { // If some other node is available and has a lower score than this node, then // that other node is a good place to migrate to. tracing::info!( "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", self.intent.get_secondary() ); return Some(ScheduleOptimization { sequence: self.sequence, action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id: *secondary, new_node_id: candidate_node, }), }); } } None } /// Return true if the optimization was really applied: it will not be applied if the optimization's /// sequence is behind this tenant shard's pub(crate) fn apply_optimization( &mut self, scheduler: &mut Scheduler, optimization: ScheduleOptimization, ) -> bool { if optimization.sequence != self.sequence { return false; } metrics::METRICS_REGISTRY .metrics_group .storage_controller_schedule_optimization .inc(); match optimization.action { ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { old_attached_node_id, new_attached_node_id, }) => { self.intent.demote_attached(scheduler, old_attached_node_id); self.intent .promote_attached(scheduler, new_attached_node_id); } ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id, new_node_id, }) => { self.intent.remove_secondary(scheduler, old_node_id); self.intent.push_secondary(scheduler, new_node_id); } } true } /// Query whether the tenant's observed state for attached node matches its intent 
state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. /// /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this /// funciton should not be used to decide whether to reconcile. pub(crate) fn stably_attached(&self) -> Option { if let Some(attach_intent) = self.intent.attached { match self.observed.locations.get(&attach_intent) { Some(loc) => match &loc.conf { Some(conf) => match conf.mode { LocationConfigMode::AttachedMulti | LocationConfigMode::AttachedSingle | LocationConfigMode::AttachedStale => { // Our intent and observed state agree that this node is in an attached state. Some(attach_intent) } // Our observed config is not an attached state _ => None, }, // Our observed state is None, i.e. in flux None => None, }, // We have no observed state for this node None => None, } } else { // Our intent is not to attach None } } fn dirty(&self, nodes: &Arc>) -> bool { let mut dirty_nodes = HashSet::new(); if let Some(node_id) = self.intent.attached { // Maybe panic: it is a severe bug if we try to attach while generation is null. 
let generation = self .generation .expect("Attempted to enter attached state without a generation"); let wanted_conf = attached_location_conf(generation, &self.shard, &self.config, &self.policy); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { dirty_nodes.insert(node_id); } } } for node_id in &self.intent.secondary { let wanted_conf = secondary_location_conf(&self.shard, &self.config); match self.observed.locations.get(node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { dirty_nodes.insert(*node_id); } } } for node_id in self.observed.locations.keys() { if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { // We have observed state that isn't part of our intent: need to clean it up. dirty_nodes.insert(*node_id); } } dirty_nodes.retain(|node_id| { nodes .get(node_id) .map(|n| n.is_available()) .unwrap_or(false) }); !dirty_nodes.is_empty() } #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn get_reconcile_needed( &mut self, pageservers: &Arc>, ) -> ReconcileNeeded { // If there are any ambiguous observed states, and the nodes they refer to are available, // we should reconcile to clean them up. let mut dirty_observed = false; for (node_id, observed_loc) in &self.observed.locations { let node = pageservers .get(node_id) .expect("Nodes may not be removed while referenced"); if observed_loc.conf.is_none() && node.is_available() { dirty_observed = true; break; } } let active_nodes_dirty = self.dirty(pageservers); // Even if there is no pageserver work to be done, if we have a pending notification to computes, // wake up a reconciler to send it. 
let do_reconcile = active_nodes_dirty || dirty_observed || self.pending_compute_notification; if !do_reconcile { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } // If we are currently splitting, then never start a reconciler task: the splitting logic // requires that shards are not interfered with while it runs. Do this check here rather than // up top, so that we only log this message if we would otherwise have done a reconciliation. if !matches!(self.splitting, SplitState::Idle) { tracing::info!("Refusing to reconcile, splitting in progress"); return ReconcileNeeded::No; } // Reconcile already in flight for the current sequence? if let Some(handle) = &self.reconciler { if handle.sequence == self.sequence { tracing::info!( "Reconciliation already in progress for sequence {:?}", self.sequence, ); return ReconcileNeeded::WaitExisting(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), error: self.last_error.clone(), seq: self.sequence, }); } } // Pre-checks done: finally check whether we may actually do the work match self.scheduling_policy { ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential | ShardSchedulingPolicy::Pause => {} ShardSchedulingPolicy::Stop => { // We only reach this point if there is work to do and we're going to skip // doing it: warn it obvious why this tenant isn't doing what it ought to. tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); return ReconcileNeeded::No; } } ReconcileNeeded::Yes } /// Ensure the sequence number is set to a value where waiting for this value will make us wait /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. /// /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property /// that the waiter will not complete until some future Reconciler is constructed and run. 
fn ensure_sequence_ahead(&mut self) { // Find the highest sequence for which a Reconciler has previously run or is currently // running let max_seen = std::cmp::max( self.reconciler .as_ref() .map(|r| r.sequence) .unwrap_or(Sequence(0)), std::cmp::max(self.waiter.load(), self.error_waiter.load()), ); if self.sequence <= max_seen { self.sequence = max_seen.next(); } } /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. /// /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but /// you would like to wait on the next reconciler that gets spawned in the background. pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { self.ensure_sequence_ahead(); ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), error: self.last_error.clone(), seq: self.sequence, } } async fn reconcile( sequence: Sequence, mut reconciler: Reconciler, must_notify: bool, ) -> ReconcileResult { // Attempt to make observed state match intent state let result = reconciler.reconcile().await; // If we know we had a pending compute notification from some previous action, send a notification irrespective // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be // sending a notification of a location that isn't really attached. if result.is_ok() && must_notify { // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] reconciler.compute_notify().await.ok(); } else if must_notify { // Carry this flag so that the reconciler's result will indicate that it still needs to retry // the compute hook notification eventually. 
reconciler.compute_notify_failure = true; } // Update result counter let outcome_label = match &result { Ok(_) => ReconcileOutcome::Success, Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, Err(_) => ReconcileOutcome::Error, }; metrics::METRICS_REGISTRY .metrics_group .storage_controller_reconcile_complete .inc(ReconcileCompleteLabelGroup { status: outcome_label, }); // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might // try and schedule more work in response to our result. ReconcileResult { sequence, result, tenant_shard_id: reconciler.tenant_shard_id, generation: reconciler.generation, observed_deltas: reconciler.observed_deltas(), pending_compute_notification: reconciler.compute_notify_failure, } } #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, reconciler_config: ReconcilerConfig, service_config: &service::Config, persistence: &Arc, units: ReconcileUnits, gate_guard: GateGuard, cancel: &CancellationToken, ) -> Option { // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before // doing our sequence's work. let old_handle = self.reconciler.take(); // Build list of nodes from which the reconciler should detach let mut detach = Vec::new(); for node_id in self.observed.locations.keys() { if self.intent.get_attached() != &Some(*node_id) && !self.intent.secondary.contains(node_id) { detach.push( pageservers .get(node_id) .expect("Intent references non-existent pageserver") .clone(), ) } } // Advance the sequence before spawning a reconciler, so that sequence waiters // can distinguish between before+after the reconcile completes. 
self.ensure_sequence_ahead(); let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); let reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, placement_policy: self.policy.clone(), generation: self.generation, intent: reconciler_intent, detach, reconciler_config, config: self.config.clone(), preferred_az: self.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, _resource_units: units, cancel: reconciler_cancel.clone(), persistence: persistence.clone(), compute_notify_failure: false, }; let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, shard_id=%reconciler.tenant_shard_id.shard_slug()); metrics::METRICS_REGISTRY .metrics_group .storage_controller_reconcile_spawn .inc(); let result_tx = result_tx.clone(); let join_handle = tokio::task::spawn( async move { // Wait for any previous reconcile task to complete before we start if let Some(old_handle) = old_handle { old_handle.cancel.cancel(); if let Err(e) = old_handle.handle.await { // We can't do much with this other than log it: the task is done, so // we may proceed with our work. tracing::error!("Unexpected join error waiting for reconcile task: {e}"); } } // Early check for cancellation before doing any work // TODO: wrap all remote API operations in cancellation check // as well. 
if reconciler.cancel.is_cancelled() { metrics::METRICS_REGISTRY .metrics_group .storage_controller_reconcile_complete .inc(ReconcileCompleteLabelGroup { status: ReconcileOutcome::Cancel, }); return; } let (tenant_id_label, shard_number_label, sequence_label) = { ( reconciler.tenant_shard_id.tenant_id.to_string(), reconciler.tenant_shard_id.shard_number.0.to_string(), reconcile_seq.to_string(), ) }; let label_group = ReconcileLongRunningLabelGroup { tenant_id: &tenant_id_label, shard_number: &shard_number_label, sequence: &sequence_label, }; let reconcile_fut = Self::reconcile(reconcile_seq, reconciler, must_notify); let long_reconcile_fut = { let label_group = label_group.clone(); async move { tokio::time::sleep(long_reconcile_threshold).await; tracing::warn!("Reconcile passed the long running threshold of {long_reconcile_threshold:?}"); metrics::METRICS_REGISTRY .metrics_group .storage_controller_reconcile_long_running .inc(label_group); } }; let reconcile_fut = std::pin::pin!(reconcile_fut); let long_reconcile_fut = std::pin::pin!(long_reconcile_fut); let (was_long, result) = match future::select(reconcile_fut, long_reconcile_fut).await { Either::Left((reconcile_result, _)) => (false, reconcile_result), Either::Right((_, reconcile_fut)) => (true, reconcile_fut.await), }; if was_long { let id = metrics::METRICS_REGISTRY .metrics_group .storage_controller_reconcile_long_running .with_labels(label_group); metrics::METRICS_REGISTRY .metrics_group .storage_controller_reconcile_long_running .remove_metric(id); } result_tx .send(ReconcileResultRequest::ReconcileResult(result)) .ok(); } .instrument(reconciler_span), ); self.reconciler = Some(ReconcilerHandle { sequence: self.sequence, handle: join_handle, cancel: reconciler_cancel, }); Some(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), error: self.last_error.clone(), seq: self.sequence, }) } pub(crate) fn cancel_reconciler(&self) { if 
let Some(handle) = self.reconciler.as_ref() { handle.cancel.cancel() } } /// Get a waiter for any reconciliation in flight, but do not start reconciliation /// if it is not already running pub(crate) fn get_waiter(&self) -> Option { if self.reconciler.is_some() { Some(ReconcilerWaiter { tenant_shard_id: self.tenant_shard_id, seq_wait: self.waiter.clone(), error_seq_wait: self.error_waiter.clone(), error: self.last_error.clone(), seq: self.sequence, }) } else { None } } /// Called when a ReconcileResult has been emitted and the service is updating /// our state: if the result is from a sequence >= my ReconcileHandle, then drop /// the handle to indicate there is no longer a reconciliation in progress. pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) { if let Some(reconcile_handle) = &self.reconciler { if reconcile_handle.sequence <= sequence { self.reconciler = None; } } } /// If we had any state at all referring to this node ID, drop it. Does not /// attempt to reschedule. /// /// Returns true if we modified the node's intent state. 
pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool { let mut intent_modified = false; // Drop if this node was our attached intent if self.intent.attached == Some(node_id) { self.intent.attached = None; intent_modified = true; } // Drop from the list of secondaries, and check if we modified it let had_secondaries = self.intent.secondary.len(); self.intent.secondary.retain(|n| n != &node_id); intent_modified |= self.intent.secondary.len() != had_secondaries; debug_assert!(!self.intent.all_pageservers().contains(&node_id)); intent_modified } pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { self.scheduling_policy = p; } pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { &self.scheduling_policy } pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { // Ordering: always set last_error before advancing sequence, so that sequence // waiters are guaranteed to see a Some value when they see an error. *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); self.error_waiter.advance(sequence); } pub(crate) fn from_persistent( tsp: TenantShardPersistence, intent: IntentState, ) -> anyhow::Result { let tenant_shard_id = tsp.get_tenant_shard_id()?; let shard_identity = tsp.get_shard_identity()?; metrics::METRICS_REGISTRY .metrics_group .storage_controller_tenant_shards .inc(); Ok(Self { tenant_shard_id, shard: shard_identity, sequence: Sequence::initial(), generation: tsp.generation.map(|g| Generation::new(g as u32)), policy: serde_json::from_str(&tsp.placement_policy).unwrap(), intent, observed: ObservedState::new(), config: serde_json::from_str(&tsp.config).unwrap(), reconciler: None, splitting: tsp.splitting, waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), 
preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } pub(crate) fn to_persistent(&self) -> TenantShardPersistence { TenantShardPersistence { tenant_id: self.tenant_shard_id.tenant_id.to_string(), shard_number: self.tenant_shard_id.shard_number.0 as i32, shard_count: self.tenant_shard_id.shard_count.literal() as i32, shard_stripe_size: self.shard.stripe_size.0 as i32, generation: self.generation.map(|g| g.into().unwrap_or(0) as i32), generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64), placement_policy: serde_json::to_string(&self.policy).unwrap(), config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), } } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { self.preferred_az_id.as_ref() } pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { self.preferred_az_id = Some(preferred_az_id); } /// Returns all the nodes to which this tenant shard is attached according to the /// observed state and the generations. Return vector is sorted from latest generation /// to earliest. pub(crate) fn attached_locations(&self) -> Vec<(NodeId, Generation)> { self.observed .locations .iter() .filter_map(|(node_id, observed)| { use LocationConfigMode::{AttachedMulti, AttachedSingle, AttachedStale}; let conf = observed.conf.as_ref()?; match (conf.generation, conf.mode) { (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { Some((*node_id, gen)) } _ => None, } }) .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { lhs_gen.cmp(rhs_gen).reverse() }) .map(|(node_id, gen)| (node_id, Generation::new(gen))) .collect() } /// Update the observed state of the tenant by applying incremental deltas /// /// Deltas are generated by reconcilers via [`Reconciler::observed_deltas`]. 
/// They are then filtered in [`crate::service::Service::process_result`].
///
/// An upsert whose generation is older than the generation currently observed for the same
/// node is not applied: the observed config for that node is set to `None` instead.
pub(crate) fn apply_observed_deltas(
    &mut self,
    deltas: impl Iterator<Item = ObservedStateDelta>,
) {
    for delta in deltas {
        match delta {
            ObservedStateDelta::Delete(node_id) => {
                tracing::info!("Deleting observed location {}", node_id);
                self.observed.locations.remove(&node_id);
            }
            ObservedStateDelta::Upsert(ups) => {
                let (node_id, loc) = *ups;

                let crnt_gen = self
                    .observed
                    .locations
                    .get(&node_id)
                    .and_then(|loc| loc.conf.as_ref())
                    .and_then(|conf| conf.generation);
                let new_gen = loc.conf.as_ref().and_then(|conf| conf.generation);

                // If the generation of the observed location in the delta is lagging
                // behind the current one, then we have a race condition and cannot
                // be certain about the true observed state.  Set the observed state
                // to None in order to reflect this.
                if let (Some(crnt), Some(new)) = (crnt_gen, new_gen) {
                    if crnt > new {
                        tracing::warn!(
                            "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})",
                            node_id,
                            loc,
                            crnt,
                            new
                        );

                        self.observed
                            .locations
                            .insert(node_id, ObservedStateLocation { conf: None });

                        continue;
                    }
                }

                match &loc.conf {
                    Some(conf) => {
                        tracing::info!("Updating observed location {}: {:?}", node_id, conf);
                    }
                    None => {
                        tracing::info!("Setting observed location {} to None", node_id,)
                    }
                }

                self.observed.locations.insert(node_id, loc);
            }
        }
    }
}
}

/// Keeps the tenant-shard count metric in step with the number of live `TenantShard`s.
impl Drop for TenantShard {
    fn drop(&mut self) {
        metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_tenant_shards
            .dec();
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use std::{cell::RefCell, rc::Rc};

    use pageserver_api::{
        controller_api::NodeAvailability,
        shard::{ShardCount, ShardNumber},
    };
    use rand::{rngs::StdRng, SeedableRng};
    use utils::id::TenantId;

    use crate::scheduler::test_utils::make_test_nodes;

    use super::*;

    /// Build one shard (number 0, `ShardCount::new(1)`) of a freshly generated tenant, with no
    /// preferred AZ.
    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
        let tenant_id = TenantId::generate();
        let shard_number = ShardNumber(0);
        let shard_count = ShardCount::new(1);

        let identity = ShardIdentity::new(
            shard_number,
            shard_count,
            pageserver_api::shard::ShardStripeSize(32768),
        )
        .unwrap();

        TenantShard::new(
            TenantShardId {
                tenant_id,
                shard_number,
                shard_count,
            },
            identity,
            policy,
            None,
        )
    }

    /// Build every shard of a freshly generated tenant.
    pub(crate) fn make_test_tenant(
        policy: PlacementPolicy,
        shard_count: ShardCount,
        preferred_az: Option<AvailabilityZone>,
    ) -> Vec<TenantShard> {
        make_test_tenant_with_id(TenantId::generate(), policy, shard_count, preferred_az)
    }

    /// Build every shard of the tenant `tenant_id`: one `TenantShard` per shard number in
    /// `0..shard_count.count()`, all sharing `policy` and `preferred_az`.
    pub(crate) fn make_test_tenant_with_id(
        tenant_id: TenantId,
        policy: PlacementPolicy,
        shard_count: ShardCount,
        preferred_az: Option<AvailabilityZone>,
    ) -> Vec<TenantShard> {
        let mut shards = Vec::new();
        for i in 0..shard_count.count() {
            let shard_number = ShardNumber(i);
            let identity = ShardIdentity::new(
                shard_number,
                shard_count,
                pageserver_api::shard::ShardStripeSize(32768),
            )
            .unwrap();

            shards.push(TenantShard::new(
                TenantShardId {
                    tenant_id,
                    shard_number,
                    shard_count,
                },
                identity,
                policy.clone(),
                preferred_az.clone(),
            ));
        }
        shards
    }

    /// Test the scheduling behaviors used when a tenant configured for HA is subject
    /// to nodes being marked offline.
    #[test]
    fn tenant_ha_scheduling() -> anyhow::Result<()> {
        // Three nodes are available, but an Attached(1) tenant only needs two of them:
        // the third is expected to stay unused.
        let mut nodes = make_test_nodes(3, &[]);
        let mut scheduler = Scheduler::new(nodes.values());
        let mut context = ScheduleContext::default();

        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
        tenant_shard
            .schedule(&mut scheduler, &mut context)
            .expect("we have enough nodes, scheduling should work");

        // Initially we expect one attached and one secondary location, on different nodes
        assert_eq!(tenant_shard.intent.secondary.len(), 1);
        assert!(tenant_shard.intent.attached.is_some());

        let attached_node_id = tenant_shard.intent.attached.unwrap();
        let secondary_node_id = *tenant_shard.intent.secondary.last().unwrap();
        assert_ne!(attached_node_id, secondary_node_id);

        // Demoting the attached node (as happens when it goes offline) turns it into a secondary
        let demoted = tenant_shard
            .intent
            .demote_attached(&mut scheduler, attached_node_id);
        assert!(demoted);
        assert!(tenant_shard.intent.attached.is_none());
        assert_eq!(tenant_shard.intent.secondary.len(), 2);

        // Tell the scheduler that the node really is offline
        nodes
            .get_mut(&attached_node_id)
            .unwrap()
            .set_availability(NodeAvailability::Offline);
        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());

        // Rescheduling should promote the still-available secondary node to attached
        tenant_shard
            .schedule(&mut scheduler, &mut context)
            .expect("active nodes are available");
        assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);

        // The node that used to be attached stays around as a secondary
        assert_eq!(
            *tenant_shard.intent.secondary.last().unwrap(),
            attached_node_id
        );

        tenant_shard.intent.clear(&mut scheduler);

        Ok(())
    }

    /// Intent derived from observed state: newest attached generation wins the attachment.
    #[test]
    fn intent_from_observed() -> anyhow::Result<()> {
        let nodes = make_test_nodes(3, &[]);
        let mut scheduler = Scheduler::new(nodes.values());

        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));

        // Observed location for this shard in the given attached mode and generation.
        let shard = tenant_shard.shard;
        let observed_location = |mode: LocationConfigMode, generation: u32| ObservedStateLocation {
            conf: Some(LocationConfig {
                mode,
                generation: Some(generation),
                secondary_conf: None,
                shard_number: shard.number.0,
                shard_count: shard.count.literal(),
                shard_stripe_size: shard.stripe_size.0,
                tenant_conf: TenantConfig::default(),
            }),
        };

        tenant_shard.observed.locations.insert(
            NodeId(3),
            observed_location(LocationConfigMode::AttachedMulti, 2),
        );
        tenant_shard.observed.locations.insert(
            NodeId(2),
            observed_location(LocationConfigMode::AttachedStale, 1),
        );

        tenant_shard.intent_from_observed(&mut scheduler);

        // The attached location with the highest generation becomes the attached intent
        assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
        // The remaining location becomes a secondary
        assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);

        scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;

        tenant_shard.intent.clear(&mut scheduler);

        Ok(())
    }

    /// `schedule()` is a no-op under `Pause` and places locations under `Active`.
    #[test]
    fn scheduling_mode() -> anyhow::Result<()> {
        let nodes = make_test_nodes(3, &[]);
        let mut scheduler = Scheduler::new(nodes.values());

        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));

        // Paused: scheduling succeeds but must not pick any pageserver
        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
        let paused = tenant_shard.schedule(&mut scheduler, &mut ScheduleContext::default());
        assert!(paused.is_ok());
        assert!(tenant_shard.intent.all_pageservers().is_empty());

        // Active: scheduling succeeds and picks pageservers
        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
        let active = tenant_shard.schedule(&mut scheduler, &mut ScheduleContext::default());
        assert!(active.is_ok());
        assert!(!tenant_shard.intent.all_pageservers().is_empty());

        tenant_shard.intent.clear(&mut scheduler);

        Ok(())
    }

    /// Two shards attached to the same node should each propose migrating to their secondary.
    #[test]
    fn optimize_attachment() -> anyhow::Result<()> {
        let nodes = make_test_nodes(3, &[]);
        let mut scheduler = Scheduler::new(nodes.values());

        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));

        // Initially: both shards are attached on node 1, and their secondary locations
        // are on different nodes (2 and 3).
        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));

        let mut schedule_context = ScheduleContext::default();
        for shard in [&shard_a, &shard_b] {
            schedule_context.avoid(&shard.intent.all_pageservers());
            schedule_context.push_attached(shard.intent.get_attached().unwrap());
        }

        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);

        // Either shard should recognize that it has the option to switch to a secondary location
        // where there would be no other shards from the same tenant, and request to do so.
        let expected_a = ScheduleOptimization {
            sequence: shard_a.sequence,
            action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
                old_attached_node_id: NodeId(1),
                new_attached_node_id: NodeId(2),
            }),
        };
        assert_eq!(optimization_a, Some(expected_a));

        // Note that optimizing two shards in the same tenant with the same ScheduleContext is
        // mutually exclusive (the optimization of one invalidates the stats) -- it is the
        // responsibility of [`Service::optimize_all`] to avoid trying to do optimizations for
        // multiple shards in the same tenant at the same time.  Generating both optimizations
        // is just done for test purposes
        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
        let expected_b = ScheduleOptimization {
            sequence: shard_b.sequence,
            action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
                old_attached_node_id: NodeId(1),
                new_attached_node_id: NodeId(3),
            }),
        };
        assert_eq!(optimization_b, Some(expected_b));

        // Applying these optimizations should result in the end state proposed
        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);

        shard_a.intent.clear(&mut scheduler);
        shard_b.intent.clear(&mut scheduler);

        Ok(())
    }

    /// Two shards sharing a secondary node should spread out when an empty node exists.
    #[test]
    fn optimize_secondary() -> anyhow::Result<()> {
        let nodes = make_test_nodes(4, &[]);
        let mut scheduler = Scheduler::new(nodes.values());

        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));

        // Initially: the shards are attached on different nodes (1 and 2), and both have
        // their secondary location on node 3.
        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));

        let mut schedule_context = ScheduleContext::default();
        for shard in [&shard_a, &shard_b] {
            schedule_context.avoid(&shard.intent.all_pageservers());
            schedule_context.push_attached(shard.intent.get_attached().unwrap());
        }

        let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);

        // Since there is a node with no locations available, the node with two locations for the
        // same tenant should generate an optimization to move one away
        let expected_a = ScheduleOptimization {
            sequence: shard_a.sequence,
            action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
                old_node_id: NodeId(3),
                new_node_id: NodeId(4),
            }),
        };
        assert_eq!(optimization_a, Some(expected_a));

        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);

        shard_a.intent.clear(&mut scheduler);
        shard_b.intent.clear(&mut scheduler);

        Ok(())
    }

    // Optimize til quiescent: this emulates what Service::optimize_all does, when
    // called repeatedly in the background.
/// Repeatedly runs the shards' optimizers until none of them proposes a change.
///
/// Every pass builds a fresh [`ScheduleContext`] from the current intent of all `shards`,
/// then applies at most one optimization -- an attachment optimization is preferred over
/// a secondary one -- before starting the next pass.
///
/// Returns the applied optimizations, in the order in which they were applied.
///
/// # Panics
///
/// Panics once 1000 optimizations have been applied without reaching a fixed point,
/// which guards the tests against a non-converging optimizer.
// NOTE(review): the generic arguments of `nodes` and of the return type were missing in
// the reviewed text. `HashMap<NodeId, Node>` follows from how callers index the map;
// `ScheduleOptimization` is assumed to be the optimizer's proposal type -- confirm.
fn optimize_til_idle(
    nodes: &HashMap<NodeId, Node>,
    scheduler: &mut Scheduler,
    shards: &mut [TenantShard],
) -> Vec<ScheduleOptimization> {
    let mut applied = Vec::default();
    let mut loop_n = 0;

    loop {
        // The context must reflect the intent state left behind by the previous pass,
        // so it is rebuilt from scratch every time.
        let mut schedule_context = ScheduleContext::default();
        for shard in shards.iter() {
            schedule_context.avoid(&shard.intent.all_pageservers());
            if let Some(attached) = shard.intent.get_attached() {
                schedule_context.push_attached(*attached);
            }
        }

        let mut any_changed = false;
        for shard in shards.iter_mut() {
            // Only ask for a secondary optimization when there is no attachment one.
            let proposal = shard
                .optimize_attachment(nodes, &schedule_context)
                .or_else(|| shard.optimize_secondary(scheduler, &schedule_context));

            if let Some(optimization) = proposal {
                applied.push(optimization.clone());
                shard.apply_optimization(scheduler, optimization);
                any_changed = true;
                // The context is stale after a change: restart with a fresh one.
                break;
            }
        }

        if !any_changed {
            break;
        }

        // Assert no infinite loop
        loop_n += 1;
        assert!(loop_n < 1000);
    }

    applied
}

/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
/// that it converges.
#[test]
fn optimize_add_nodes() -> anyhow::Result<()> {
    let nodes = make_test_nodes(4, &[]);

    // Only show the scheduler a couple of nodes
    let mut scheduler = Scheduler::new([].iter());
    scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
    scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());

    let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
    let mut schedule_context = ScheduleContext::default();
    for shard in &mut shards {
        // Unwrap (rather than `assert!(.. .is_ok())`) so a failure reports the error.
        shard
            .schedule(&mut scheduler, &mut schedule_context)
            .unwrap();
    }

    // We should see equal number of locations on the two nodes.
    assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
    assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);

    assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
    assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);

    // Add another two nodes: we should see the shards spread out when their optimize
    // methods are called
    scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
    scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
    optimize_til_idle(&nodes, &mut scheduler, &mut shards);

    assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
    assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1);

    assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
    assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1);

    assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
    assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1);

    assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
    assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1);

    // Clear every intent before the shards go out of scope.
    for shard in shards.iter_mut() {
        shard.intent.clear(&mut scheduler);
    }

    Ok(())
}

/// Test that initial shard scheduling is optimal. By optimal we mean
/// that the optimizer cannot find a way to improve it.
///
/// Two four-shard tenants are scheduled onto two nodes with their shards interleaved
/// (a0, b0, a1, b1, ...), each tenant using its own [`ScheduleContext`]. Afterwards the
/// optimizer must have nothing to apply to either tenant.
///
/// This test is an example of the scheduling issue described in
/// https://github.com/neondatabase/neon/issues/8969
#[test]
fn initial_scheduling_is_optimal() -> anyhow::Result<()> {
    use itertools::Itertools;

    let nodes = make_test_nodes(2, &[]);

    let mut scheduler = Scheduler::new([].iter());
    scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
    scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());

    let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
    let a_context = Rc::new(RefCell::new(ScheduleContext::default()));

    let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None);
    let b_context = Rc::new(RefCell::new(ScheduleContext::default()));

    // Pair every shard with the shared context of its own tenant, then alternate
    // between the two tenants when scheduling.
    let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone()));
    let b_shards_with_context = b.iter_mut().map(|shard| (shard, b_context.clone()));

    let schedule_order = a_shards_with_context.interleave(b_shards_with_context);

    for (shard, context) in schedule_order {
        let context = &mut *context.borrow_mut();
        shard.schedule(&mut scheduler, context).unwrap();
    }

    let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a);
    assert_eq!(applied_to_a, vec![]);

    let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b);
    assert_eq!(applied_to_b, vec![]);

    for shard in a.iter_mut().chain(b.iter_mut()) {
        shard.intent.clear(&mut scheduler);
    }

    Ok(())
}

/// Test AZ-aware scheduling of many tenants whose shards are scheduled in random order.
///
/// For each of 50 seeds: 100 tenants with 1 to 4 shards each, alternating between two
/// preferred AZs, are created on 4 nodes; their shards are shuffled and scheduled one by
/// one. The test then requires that
/// - every attachment is in the shard's preferred AZ,
/// - no secondary is in the shard's preferred AZ, and
/// - the attachment count of every node is within one of half the number of shards that
///   prefer that node's AZ.
#[test]
fn random_az_shard_scheduling() -> anyhow::Result<()> {
    use rand::seq::SliceRandom;

    for seed in 0..50 {
        eprintln!("Running test with seed {seed}");
        let mut rng = StdRng::seed_from_u64(seed);

        let az_a_tag = AvailabilityZone("az-a".to_string());
        let az_b_tag = AvailabilityZone("az-b".to_string());
        let azs = [az_a_tag, az_b_tag];
        let nodes = make_test_nodes(4, &azs);
        // Number of shards whose preferred AZ is the key.
        let mut shards_per_az: HashMap<AvailabilityZone, u32> = HashMap::new();

        let mut scheduler = Scheduler::new([].iter());
        for node in nodes.values() {
            scheduler.node_upsert(node);
        }

        let mut shards = Vec::default();
        let mut az_picker = azs.iter().cycle().cloned();
        for i in 0..100 {
            let az = az_picker.next().unwrap();
            let shard_count = i % 4 + 1;
            *shards_per_az.entry(az.clone()).or_default() += shard_count;

            let tenant_shards = make_test_tenant(
                PlacementPolicy::Attached(1),
                ShardCount::new(shard_count.try_into().unwrap()),
                Some(az),
            );
            // All shards of one tenant share a single schedule context.
            let context = Rc::new(RefCell::new(ScheduleContext::default()));
            shards.extend(
                tenant_shards
                    .into_iter()
                    .map(|shard| (shard, context.clone())),
            );
        }

        shards.shuffle(&mut rng);

        #[derive(Default, Debug)]
        struct NodeStats {
            attachments: u32,
            secondaries: u32,
        }

        let mut node_stats: HashMap<NodeId, NodeStats> = HashMap::default();
        let mut attachments_in_wrong_az = 0;
        let mut secondaries_in_wrong_az = 0;

        for (shard, context) in &mut shards {
            let context = &mut *context.borrow_mut();
            shard.schedule(&mut scheduler, context).unwrap();

            let attached_node = shard.intent.get_attached().unwrap();
            let stats = node_stats.entry(attached_node).or_default();
            stats.attachments += 1;

            let secondary_node = *shard.intent.get_secondary().first().unwrap();
            let stats = node_stats.entry(secondary_node).or_default();
            stats.secondaries += 1;

            let attached_node_az = nodes
                .get(&attached_node)
                .unwrap()
                .get_availability_zone_id();
            let secondary_node_az = nodes
                .get(&secondary_node)
                .unwrap()
                .get_availability_zone_id();
            let preferred_az = shard.preferred_az().unwrap();

            if attached_node_az != preferred_az {
                eprintln!(
                    "{} attachment was scheduled in AZ {} but preferred AZ {}",
                    shard.tenant_shard_id, attached_node_az, preferred_az
                );
                attachments_in_wrong_az += 1;
            }

            if secondary_node_az == preferred_az {
                // Report the AZ of the secondary, which is the location being flagged.
                eprintln!(
                    "{} secondary was scheduled in AZ {} which matches preference",
                    shard.tenant_shard_id, secondary_node_az
                );
                secondaries_in_wrong_az += 1;
            }
        }

        let mut violations = Vec::default();

        if attachments_in_wrong_az > 0 {
            violations.push(format!(
                "{} attachments scheduled to the incorrect AZ",
                attachments_in_wrong_az
            ));
        }

        if secondaries_in_wrong_az > 0 {
            violations.push(format!(
                "{} secondaries scheduled to the incorrect AZ",
                secondaries_in_wrong_az
            ));
        }

        eprintln!(
            "attachments_in_wrong_az={} secondaries_in_wrong_az={}",
            attachments_in_wrong_az, secondaries_in_wrong_az
        );

        for (node_id, stats) in &node_stats {
            let node_az = nodes.get(node_id).unwrap().get_availability_zone_id();
            // NOTE(review): the division by 2 presumably reflects two nodes per AZ as
            // produced by `make_test_nodes(4, &azs)` -- confirm against that helper.
            let ideal_attachment_load = shards_per_az.get(node_az).unwrap() / 2;
            // Half-open range, i.e. ideal - 1 ..= ideal + 1.
            let allowed_attachment_load =
                (ideal_attachment_load - 1)..(ideal_attachment_load + 2);

            if !allowed_attachment_load.contains(&stats.attachments) {
                violations.push(format!(
                    "Found {} attachments on node {}, but expected {}",
                    stats.attachments, node_id, ideal_attachment_load
                ));
            }

            eprintln!(
                "{}: attachments={} secondaries={} ideal_attachment_load={}",
                node_id, stats.attachments, stats.secondaries, ideal_attachment_load
            );
        }

        assert!(violations.is_empty(), "{violations:?}");

        for (mut shard, _ctx) in shards {
            shard.intent.clear(&mut scheduler);
        }
    }
    Ok(())
}
}