From a0fa1b928dc05fe5bdc07cb0bc1ea0245454e76d Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 14 Jun 2024 15:21:34 +0100
Subject: [PATCH] storcon: refine heartbeat handling of newly added nodes

---
 storage_controller/src/heartbeater.rs |  4 ++
 storage_controller/src/service.rs     | 76 ++++++++++++++++++++-------
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs
index 1ef97e78eb..14cda0a289 100644
--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -31,6 +31,7 @@ pub(crate) enum PageserverState {
     Available {
         last_seen_at: Instant,
         utilization: PageserverUtilization,
+        new: bool,
     },
     Offline,
 }
@@ -127,6 +128,7 @@ impl HeartbeaterTask {
             heartbeat_futs.push({
                 let jwt_token = self.jwt_token.clone();
                 let cancel = self.cancel.clone();
+                let new_node = !self.state.contains_key(node_id);
 
                 // Clone the node and mark it as available such that the request
                 // goes through to the pageserver even when the node is marked offline.
@@ -159,6 +161,7 @@ impl HeartbeaterTask {
                             PageserverState::Available {
                                 last_seen_at: Instant::now(),
                                 utilization,
+                                new: new_node,
                             }
                         } else {
                             PageserverState::Offline
@@ -220,6 +223,7 @@ impl HeartbeaterTask {
                     }
                 },
                 Vacant(_) => {
+                    // This is a new node. Don't generate a delta for it.
                     deltas.push((node_id, ps_state.clone()));
                 }
             }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ff9c1434f3..07c8065b5c 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -747,29 +747,61 @@ impl Service {
             let res = self.heartbeater.heartbeat(nodes).await;
             if let Ok(deltas) = res {
                 for (node_id, state) in deltas.0 {
-                    let new_availability = match state {
-                        PageserverState::Available { utilization, .. } => NodeAvailability::Active(
-                            UtilizationScore(utilization.utilization_score),
+                    let (new_node, new_availability) = match state {
+                        PageserverState::Available {
+                            utilization, new, ..
+                        } => (
+                            new,
+                            NodeAvailability::Active(UtilizationScore(
+                                utilization.utilization_score,
+                            )),
                         ),
-                        PageserverState::Offline => NodeAvailability::Offline,
+                        PageserverState::Offline => (false, NodeAvailability::Offline),
                     };
-                    let res = self
-                        .node_configure(node_id, Some(new_availability), None)
-                        .await;
 
-                    match res {
-                        Ok(()) => {}
-                        Err(ApiError::NotFound(_)) => {
-                            // This should be rare, but legitimate since the heartbeats are done
-                            // on a snapshot of the nodes.
-                            tracing::info!("Node {} was not found after heartbeat round", node_id);
+                    if new_node {
+                        // When the heartbeats detect a newly added node, we don't wish
+                        // to attempt to reconcile the shards assigned to it. The node
+                        // is likely handling its re-attach response, so reconciling now
+                        // would be counterproductive.
+                        //
+                        // Instead, update the in-memory state with the details learned about the
+                        // node.
+                        let mut locked = self.inner.write().unwrap();
+                        let (nodes, _tenants, scheduler) = locked.parts_mut();
+
+                        let mut new_nodes = (**nodes).clone();
+
+                        if let Some(node) = new_nodes.get_mut(&node_id) {
+                            node.set_availability(new_availability);
+                            scheduler.node_upsert(node);
                         }
-                        Err(err) => {
-                            tracing::error!(
-                                "Failed to update node {} after heartbeat round: {}",
-                                node_id,
-                                err
-                            );
+
+                        locked.nodes = Arc::new(new_nodes);
+                    } else {
+                        // This is the code path for genuine availability transitions (i.e. a node
+                        // goes unavailable and/or comes back online).
+                        let res = self
+                            .node_configure(node_id, Some(new_availability), None)
+                            .await;
+
+                        match res {
+                            Ok(()) => {}
+                            Err(ApiError::NotFound(_)) => {
+                                // This should be rare, but legitimate since the heartbeats are done
+                                // on a snapshot of the nodes.
+                                tracing::info!(
+                                    "Node {} was not found after heartbeat round",
+                                    node_id
+                                );
+                            }
+                            Err(err) => {
+                                tracing::error!(
+                                    "Failed to update node {} after heartbeat round: {}",
+                                    node_id,
+                                    err
+                                );
+                            }
+                        }
                     }
                 }
             }
@@ -4356,6 +4388,12 @@ impl Service {
         // When a node comes back online, we must reconcile any tenant that has a None observed
         // location on the node.
         for tenant_shard in locked.tenants.values_mut() {
+            // If a reconciliation is already in progress, rely on the previous scheduling
+            // decision and skip triggering a new reconciliation.
+            if tenant_shard.reconciler.is_some() {
+                continue;
+            }
+
             if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
                 if observed_loc.conf.is_none() {
                     self.maybe_reconcile_shard(tenant_shard, &new_nodes);
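
Reviewer note (not part of the patch): below is a minimal, self-contained Rust sketch of the control flow the first service.rs hunk introduces. The types, field names, and the `classify` helper are simplified placeholders rather than the storage controller's real API; only the branch structure mirrors the diff: a first-seen node only has its availability recorded in memory, while a known node goes through node_configure so reconciliation can run.

use std::collections::HashMap;

#[derive(Clone, Debug)]
enum PageserverState {
    Available { utilization_score: u64, new: bool },
    Offline,
}

#[derive(Debug)]
enum NodeAvailability {
    Active(u64),
    Offline,
}

// Mirror of the match in the patch: extract the availability transition and
// whether this is the first heartbeat round in which the node was seen.
fn classify(state: PageserverState) -> (bool, NodeAvailability) {
    match state {
        PageserverState::Available {
            utilization_score,
            new,
        } => (new, NodeAvailability::Active(utilization_score)),
        PageserverState::Offline => (false, NodeAvailability::Offline),
    }
}

fn main() {
    let deltas = HashMap::from([
        (1u64, PageserverState::Available { utilization_score: 10, new: true }),
        (2u64, PageserverState::Offline),
    ]);

    for (node_id, state) in deltas {
        let (new_node, availability) = classify(state);
        if new_node {
            // Newly added node: record the learned availability in memory only,
            // since the node is likely still processing its re-attach response.
            println!("node {node_id}: update in-memory state to {availability:?}");
        } else {
            // Genuine availability transition: route through node_configure so
            // that shard reconciliation is triggered where needed.
            println!("node {node_id}: node_configure({availability:?})");
        }
    }
}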