diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index ca55d6c593..d84803733a 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -9,7 +9,9 @@ use std::time::Duration;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
-use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
+use storage_controller::service::{
+    Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
+};
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
@@ -64,6 +66,10 @@ struct Cli {
     /// Grace period before marking unresponsive pageserver offline
     #[arg(long)]
     max_unavailable_interval: Option<humantime::Duration>,
+
+    /// Maximum number of reconcilers that may run in parallel
+    #[arg(long)]
+    reconciler_concurrency: Option<usize>,
 }
 
 enum StrictMode {
@@ -243,6 +249,9 @@ async fn async_main() -> anyhow::Result<()> {
             .max_unavailable_interval
             .map(humantime::Duration::into)
             .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
+        reconciler_concurrency: args
+            .reconciler_concurrency
+            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 49cfaad569..28801ede6e 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -51,6 +51,10 @@ pub(super) struct Reconciler {
     /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
     pub(crate) compute_notify_failure: bool,
 
+    /// The Reconciler is responsible for keeping alive these semaphore units, which limit how many
+    /// Reconcilers we will spawn concurrently.
+    pub(crate) _resource_units: ReconcileUnits,
+
     /// A means to abort background reconciliation: it is essential to
     /// call this when something changes in the original TenantShard that
     /// will make this reconciliation impossible or unnecessary, for
@@ -66,6 +70,19 @@ pub(super) struct Reconciler {
     pub(crate) persistence: Arc<Persistence>,
 }
 
+/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
+pub(crate) struct ReconcileUnits {
+    _sem_units: tokio::sync::OwnedSemaphorePermit,
+}
+
+impl ReconcileUnits {
+    pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self {
+        Self {
+            _sem_units: sem_units,
+        }
+    }
+}
+
 /// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
 /// reference counting for Scheduler. The IntentState is what the scheduler works with,
 /// and the TargetState is just the instruction for a particular Reconciler run.
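Aside (editorial note, not part of the patch): `ReconcileUnits` is a thin RAII wrapper around a `tokio::sync::OwnedSemaphorePermit`: holding the struct holds the permit, and dropping it returns one unit of concurrency to the pool. The standalone sketch below shows the same pattern; the `Units` struct, the limit of 2, and the task bodies are illustrative only and are not code from this repository.

```rust
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

/// Stand-in for ReconcileUnits: the permit is never read, it only needs to stay alive.
struct Units {
    _permit: OwnedSemaphorePermit,
}

#[tokio::main]
async fn main() {
    // Allow at most 2 "reconcilers" to run at once.
    let limit = Arc::new(Semaphore::new(2));
    let mut tasks = Vec::new();

    for i in 0..5 {
        // try_acquire_owned() needs an Arc<Semaphore>, hence the clone().
        match limit.clone().try_acquire_owned() {
            Ok(permit) => {
                let units = Units { _permit: permit };
                tasks.push(tokio::spawn(async move {
                    // The permit lives exactly as long as `units`, i.e. until this task ends.
                    let _units = units;
                    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
                    println!("task {i} ran");
                }));
            }
            // In the real service this is where the shard would be queued for later.
            Err(_) => println!("task {i} is over the limit, would be queued"),
        }
    }

    for t in tasks {
        t.await.unwrap();
    }
}
```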
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 0565f8e7b4..2e6f3750e7 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -10,8 +10,9 @@ use std::{
 use crate::{
     id_lock_map::IdLockMap,
     persistence::{AbortShardSplitStatus, TenantFilter},
-    reconciler::ReconcileError,
+    reconciler::{ReconcileError, ReconcileUnits},
     scheduler::{ScheduleContext, ScheduleMode},
+    tenant_shard::ReconcileNeeded,
 };
 use anyhow::Context;
 use control_plane::storage_controller::{
@@ -48,7 +49,7 @@ use pageserver_api::{
     },
 };
 use pageserver_client::mgmt_api;
-use tokio::sync::OwnedRwLockWriteGuard;
+use tokio::sync::{mpsc::error::TrySendError, OwnedRwLockWriteGuard};
 use tokio_util::sync::CancellationToken;
 use tracing::instrument;
 use utils::{
@@ -90,6 +91,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
 pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
 
+pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
+
+// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
+// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
+// than they're being pushed onto the queue.
+const MAX_DELAYED_RECONCILES: usize = 10000;
+
 // Top level state available to all HTTP handlers
 struct ServiceState {
     tenants: BTreeMap<TenantShardId, TenantShard>,
@@ -97,6 +105,9 @@ struct ServiceState {
     nodes: Arc<HashMap<NodeId, Node>>,
 
     scheduler: Scheduler,
+
+    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
+    delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
 }
 
 impl ServiceState {
@@ -104,11 +115,13 @@ impl ServiceState {
         nodes: HashMap<NodeId, Node>,
         tenants: BTreeMap<TenantShardId, TenantShard>,
         scheduler: Scheduler,
+        delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
     ) -> Self {
         Self {
            tenants,
            nodes: Arc::new(nodes),
            scheduler,
+           delayed_reconcile_rx,
        }
     }
 
@@ -142,6 +155,9 @@ pub struct Config {
     /// considered active. Once the grace period elapses, the next heartbeat failure will
     /// mark the pagseserver offline.
     pub max_unavailable_interval: Duration,
+
+    /// How many Reconcilers may be spawned concurrently
+    pub reconciler_concurrency: usize,
 }
 
 impl From<ReconcileError> for ApiError {
@@ -180,6 +196,17 @@ pub struct Service {
     // that transition it to/from Active.
     node_op_locks: IdLockMap<NodeId>,
 
+    // Limit how many Reconcilers we will spawn concurrently
+    reconciler_concurrency: Arc<tokio::sync::Semaphore>,
+
+    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile.
+    /// Send into this queue to promptly attempt to reconcile this shard next time units are available.
+    ///
+    /// Note that this state logically lives inside ServiceState, but carrying the Sender here makes the code simpler
+    /// by avoiding needing a &mut ref to something inside the ServiceState. This could be optimized to
+    /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity.
+    delayed_reconcile_tx: tokio::sync::mpsc::Sender<TenantShardId>,
+
     // Process shutdown will fire this token
     cancel: CancellationToken,
 
@@ -742,8 +769,9 @@ impl Service {
     }
 
     /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
-    /// was successful, this will update the observed state of the tenant such that subsequent
-    /// calls to [`TenantShard::maybe_reconcile`] will do nothing.
+    /// was successful and the intent hasn't changed since the Reconciler was spawned, this will update
+    /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`]
+    /// will indicate that reconciliation is not needed.
     #[instrument(skip_all, fields(
         tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
         sequence=%result.sequence
@@ -804,6 +832,21 @@ impl Service {
                 }
             }
         }
+
+        // Maybe some other work can proceed now that this job finished.
+        if self.reconciler_concurrency.available_permits() > 0 {
+            while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() {
+                let (nodes, tenants, _scheduler) = locked.parts_mut();
+                if let Some(shard) = tenants.get_mut(&tenant_shard_id) {
+                    shard.delayed_reconcile = false;
+                    self.maybe_reconcile_shard(shard, nodes);
+                }
+
+                if self.reconciler_concurrency.available_permits() == 0 {
+                    break;
+                }
+            }
+        }
     }
 
     async fn process_results(
@@ -986,6 +1029,9 @@ impl Service {
 
         let (startup_completion, startup_complete) = utils::completion::channel();
 
+        let (delayed_reconcile_tx, delayed_reconcile_rx) =
+            tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES);
+
         let cancel = CancellationToken::new();
         let heartbeater = Heartbeater::new(
             config.jwt_token.clone(),
@@ -994,13 +1040,20 @@ impl Service {
         );
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
-                nodes, tenants, scheduler,
+                nodes,
+                tenants,
+                scheduler,
+                delayed_reconcile_rx,
             ))),
             config: config.clone(),
             persistence,
-            compute_hook: Arc::new(ComputeHook::new(config)),
+            compute_hook: Arc::new(ComputeHook::new(config.clone())),
             result_tx,
             heartbeater,
+            reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new(
+                config.reconciler_concurrency,
+            )),
+            delayed_reconcile_tx,
             abort_tx,
             startup_complete: startup_complete.clone(),
             cancel,
@@ -1535,7 +1588,7 @@ impl Service {
 
         let (response, waiters) = self.do_tenant_create(create_req).await?;
 
-        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+        if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await {
             // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
             // accept compute notifications while it is in the process of creating. Reconciliation will
             // be retried in the background.
@@ -4053,20 +4106,64 @@ impl Service {
         Ok(())
     }
 
-    /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides
-    /// all the references to parts of Self that are needed
+    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`].
     fn maybe_reconcile_shard(
         &self,
         shard: &mut TenantShard,
         nodes: &Arc<HashMap<NodeId, Node>>,
     ) -> Option<ReconcilerWaiter> {
-        shard.maybe_reconcile(
+        let reconcile_needed = shard.get_reconcile_needed(nodes);
+
+        match reconcile_needed {
+            ReconcileNeeded::No => return None,
+            ReconcileNeeded::WaitExisting(waiter) => return Some(waiter),
+            ReconcileNeeded::Yes => {
+                // Fall through to try and acquire units for spawning reconciler
+            }
+        };
+
+        let units = match self.reconciler_concurrency.clone().try_acquire_owned() {
+            Ok(u) => ReconcileUnits::new(u),
+            Err(_) => {
+                tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(),
+                    "Concurrency limited: enqueued for reconcile later");
+                if !shard.delayed_reconcile {
+                    match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) {
+                        Err(TrySendError::Closed(_)) => {
+                            // Weird mid-shutdown case?
+                        }
+                        Err(TrySendError::Full(_)) => {
+                            // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task.
+                            tracing::warn!(
+                                "Many shards are waiting to reconcile: delayed_reconcile queue is full"
+                            );
+                        }
+                        Ok(()) => {
+                            shard.delayed_reconcile = true;
+                        }
+                    }
+                }
+
+                // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence
+                // number to advance. When this function is eventually called again and succeeds in getting units,
+                // it will spawn a reconciler that makes this waiter complete.
+                return Some(shard.future_reconcile_waiter());
+            }
+        };
+
+        let Ok(gate_guard) = self.gate.enter() else {
+            // Gate closed: we're shutting down, drop out.
+            return None;
+        };
+
+        shard.spawn_reconciler(
             &self.result_tx,
             nodes,
             &self.compute_hook,
             &self.config,
             &self.persistence,
-            &self.gate,
+            units,
+            gate_guard,
             &self.cancel,
         )
     }
@@ -4088,6 +4185,11 @@ impl Service {
                 schedule_context = ScheduleContext::default();
             }
 
+            // Skip checking if this shard is already enqueued for reconciliation
+            if shard.delayed_reconcile {
+                continue;
+            }
+
             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
             // dirty, spawn another rone
             if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 58b8ef8d5d..d69260b9e7 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -7,6 +7,7 @@ use std::{
 use crate::{
     metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
     persistence::TenantShardPersistence,
+    reconciler::ReconcileUnits,
     scheduler::{AffinityScore, MaySchedule, ScheduleContext},
 };
 use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
@@ -22,7 +23,7 @@ use utils::{
     generation::Generation,
     id::NodeId,
     seqwait::{SeqWait, SeqWaitError},
-    sync::gate::Gate,
+    sync::gate::GateGuard,
 };
 
 use crate::{
@@ -95,6 +96,10 @@ pub(crate) struct TenantShard {
     /// reconciliation, and timeline creation.
     pub(crate) splitting: SplitState,
 
+    /// If a tenant was enqueued for later reconcile due to hitting the concurrency limit, this flag
+    /// is set. It is cleared when the tenant is popped off the delay queue.
+    pub(crate) delayed_reconcile: bool,
+
     /// Optionally wait for reconciliation to complete up to a particular
     /// sequence number.
     #[serde(skip)]
@@ -113,8 +118,8 @@ pub(crate) struct TenantShard {
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
 
     /// If we have a pending compute notification that for some reason we weren't able to send,
-    /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry
-    /// sending it. This is the mechanism by which compute notifications are included in the scope
+    /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
+    /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope
     /// of state that we publish externally in an eventually consistent way.
     pub(crate) pending_compute_notification: bool,
 
@@ -353,6 +358,17 @@ pub(crate) struct ReconcilerHandle {
     cancel: CancellationToken,
 }
 
+pub(crate) enum ReconcileNeeded {
+    /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler
+    /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it)
+    No,
+    /// shard has a reconciler running, and its intent hasn't changed since that one was
+    /// spawned: wait for the existing reconciler rather than spawning a new one.
+    WaitExisting(ReconcilerWaiter),
+    /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
+    Yes,
+}
+
 /// When a reconcile task completes, it sends this result object
 /// to be applied to the primary TenantShard.
 pub(crate) struct ReconcileResult {
@@ -396,6 +412,7 @@ impl TenantShard {
             reconciler: None,
             splitting: SplitState::Idle,
             sequence: Sequence(1),
+            delayed_reconcile: false,
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
@@ -831,16 +848,10 @@ impl TenantShard {
 
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn maybe_reconcile(
+    pub(crate) fn get_reconcile_needed(
         &mut self,
-        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
         pageservers: &Arc<HashMap<NodeId, Node>>,
-        compute_hook: &Arc<ComputeHook>,
-        service_config: &service::Config,
-        persistence: &Arc<Persistence>,
-        gate: &Gate,
-        cancel: &CancellationToken,
-    ) -> Option<ReconcilerWaiter> {
+    ) -> ReconcileNeeded {
         // If there are any ambiguous observed states, and the nodes they refer to are available,
         // we should reconcile to clean them up.
         let mut dirty_observed = false;
@@ -863,7 +874,7 @@ impl TenantShard {
 
         if !do_reconcile {
             tracing::info!("Not dirty, no reconciliation needed.");
-            return None;
+            return ReconcileNeeded::No;
         }
 
         // If we are currently splitting, then never start a reconciler task: the splitting logic
@@ -871,7 +882,7 @@ impl TenantShard {
         // up top, so that we only log this message if we would otherwise have done a reconciliation.
         if !matches!(self.splitting, SplitState::Idle) {
             tracing::info!("Refusing to reconcile, splitting in progress");
-            return None;
+            return ReconcileNeeded::No;
         }
 
         // Reconcile already in flight for the current sequence?
@@ -881,7 +892,7 @@ impl TenantShard {
                 "Reconciliation already in progress for sequence {:?}",
                 self.sequence,
             );
-            return Some(ReconcilerWaiter {
+            return ReconcileNeeded::WaitExisting(ReconcilerWaiter {
                 tenant_shard_id: self.tenant_shard_id,
                 seq_wait: self.waiter.clone(),
                 error_seq_wait: self.error_waiter.clone(),
@@ -900,10 +911,67 @@ impl TenantShard {
                 // We only reach this point if there is work to do and we're going to skip
                 // doing it: warn it obvious why this tenant isn't doing what it ought to.
                 tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
-                return None;
+                return ReconcileNeeded::No;
             }
         }
 
+        ReconcileNeeded::Yes
+    }
+
+    /// Ensure the sequence number is set to a value where waiting for this value will make us wait
+    /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers.
+    ///
+    /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property
+    /// that the waiter will not complete until some future Reconciler is constructed and run.
+    fn ensure_sequence_ahead(&mut self) {
+        // Find the highest sequence for which a Reconciler has previously run or is currently
+        // running
+        let max_seen = std::cmp::max(
+            self.reconciler
+                .as_ref()
+                .map(|r| r.sequence)
+                .unwrap_or(Sequence(0)),
+            std::cmp::max(self.waiter.load(), self.error_waiter.load()),
+        );
+
+        if self.sequence <= max_seen {
+            self.sequence = max_seen.next();
+        }
+    }
+
+    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
+    ///
+    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
+    /// you would like to wait until one gets spawned in the background.
+    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
+        self.ensure_sequence_ahead();
+
+        ReconcilerWaiter {
+            tenant_shard_id: self.tenant_shard_id,
+            seq_wait: self.waiter.clone(),
+            error_seq_wait: self.error_waiter.clone(),
+            error: self.last_error.clone(),
+            seq: self.sequence,
+        }
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
+    pub(crate) fn spawn_reconciler(
+        &mut self,
+        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
+        pageservers: &Arc<HashMap<NodeId, Node>>,
+        compute_hook: &Arc<ComputeHook>,
+        service_config: &service::Config,
+        persistence: &Arc<Persistence>,
+        units: ReconcileUnits,
+        gate_guard: GateGuard,
+        cancel: &CancellationToken,
+    ) -> Option<ReconcilerWaiter> {
+        // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
+        // doing our sequence's work.
+        let old_handle = self.reconciler.take();
+
         // Build list of nodes from which the reconciler should detach
         let mut detach = Vec::new();
         for node_id in self.observed.locations.keys() {
@@ -919,18 +987,9 @@ impl TenantShard {
             }
         }
 
-        // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
-        // doing our sequence's work.
-        let old_handle = self.reconciler.take();
-
-        let Ok(gate_guard) = gate.enter() else {
-            // Shutting down, don't start a reconciler
-            return None;
-        };
-
         // Advance the sequence before spawning a reconciler, so that sequence waiters
         // can distinguish between before+after the reconcile completes.
-        self.sequence = self.sequence.next();
+        self.ensure_sequence_ahead();
 
         let reconciler_cancel = cancel.child_token();
         let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
@@ -945,6 +1004,7 @@ impl TenantShard {
             compute_hook: compute_hook.clone(),
             service_config: service_config.clone(),
             _gate_guard: gate_guard,
+            _resource_units: units,
             cancel: reconciler_cancel.clone(),
             persistence: persistence.clone(),
             compute_notify_failure: false,
@@ -1011,16 +1071,18 @@ impl TenantShard {
                     status: outcome_label,
                 });
 
-            result_tx
-                .send(ReconcileResult {
-                    sequence: reconcile_seq,
-                    result,
-                    tenant_shard_id: reconciler.tenant_shard_id,
-                    generation: reconciler.generation,
-                    observed: reconciler.observed,
-                    pending_compute_notification: reconciler.compute_notify_failure,
-                })
-                .ok();
+            // Constructing the result implicitly drops the Reconciler, freeing its ReconcileUnits before the Service
+            // might try to schedule more work in response to our result.
+            let result = ReconcileResult {
+                sequence: reconcile_seq,
+                result,
+                tenant_shard_id: reconciler.tenant_shard_id,
+                generation: reconciler.generation,
+                observed: reconciler.observed,
+                pending_compute_notification: reconciler.compute_notify_failure,
+            };
+
+            result_tx.send(result).ok();
             }
             .instrument(reconciler_span),
         );
@@ -1111,6 +1173,7 @@ impl TenantShard {
             error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
             last_error: Arc::default(),
             pending_compute_notification: false,
+            delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
         })
     }
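Aside (editorial note, not part of the patch): the delayed-reconcile path rests on a bounded `mpsc` channel whose `try_send` is allowed to fail. The sketch below uses made-up names and a tiny bound standing in for `MAX_DELAYED_RECONCILES`; it shows why dropping a `TrySendError::Full` is harmless when a periodic background pass exists to retry the shard anyway.

```rust
use tokio::sync::mpsc::{self, error::TrySendError};

#[tokio::main]
async fn main() {
    // A tiny bound stands in for MAX_DELAYED_RECONCILES.
    let (tx, mut rx) = mpsc::channel::<u32>(2);

    for shard in 0..4u32 {
        match tx.try_send(shard) {
            Ok(()) => println!("shard {shard} enqueued for delayed reconcile"),
            Err(TrySendError::Full(_)) => {
                // Dropping the ID is safe here: a periodic background pass would
                // eventually revisit the shard anyway.
                println!("queue full, shard {shard} left for the background pass");
            }
            Err(TrySendError::Closed(_)) => println!("receiver gone (shutdown)"),
        }
    }

    // Consumer side: drain opportunistically, much as process_result() does once permits free up.
    while let Ok(shard) = rx.try_recv() {
        println!("popped shard {shard}");
    }
}
```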
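A second aside, also not from the repository: a simplified, synchronous model of what `ensure_sequence_ahead()` guarantees. The `Sequence` type and `ensure_ahead` helper are illustrative stand-ins, but they capture the invariant the patch relies on: a waiter handed out by `future_reconcile_waiter()` can only be satisfied by a reconcile that has not started yet.

```rust
/// Illustrative stand-in for the controller's Sequence newtype.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Sequence(u64);

impl Sequence {
    fn next(self) -> Self {
        Sequence(self.0 + 1)
    }
}

/// Return a sequence strictly ahead of every reconcile that has run or is running.
fn ensure_ahead(current: Sequence, running: Option<Sequence>, completed: Sequence) -> Sequence {
    let max_seen = std::cmp::max(running.unwrap_or(Sequence(0)), completed);
    if current <= max_seen {
        max_seen.next()
    } else {
        current
    }
}

fn main() {
    // A reconciler for sequence 5 is running and sequence 4 has completed: waiting on the
    // current sequence (5) could be satisfied by the *running* reconcile, so the shard is
    // bumped to 6 before a future-reconcile waiter is handed out.
    assert_eq!(ensure_ahead(Sequence(5), Some(Sequence(5)), Sequence(4)), Sequence(6));

    // Already ahead of everything seen: left unchanged.
    assert_eq!(ensure_ahead(Sequence(7), Some(Sequence(5)), Sequence(4)), Sequence(7));

    println!("ok");
}
```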