From 96bd74697a7601c8f20abb44e9f7a80c34650933 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Tue, 7 Jan 2025 17:32:53 +0100
Subject: [PATCH] storage_controller: remove node activation reconciliation

---
 storage_controller/src/reconciler.rs |   5 +-
 storage_controller/src/service.rs    | 153 ++-------------------------
 2 files changed, 13 insertions(+), 145 deletions(-)

diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index e0a854fff7..fbb71f7995 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -211,9 +211,8 @@ impl Reconciler {
         lazy: bool,
     ) -> Result<(), ReconcileError> {
         if !node.is_available() && config.mode == LocationConfigMode::Detached {
-            // [`crate::service::Service::node_activate_reconcile`] will update the observed state
-            // when the node comes back online. At that point, the intent and observed states will
-            // be mismatched and a background reconciliation will detach.
+            // Skip if we cannot detach right now due to the node being unavailable. It will be
+            // handled by the background reconciliation loop.
             tracing::info!(
                 "Node {node} is unavailable during detach: proceeding anyway, it will be detached via background reconciliation"
             );
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 359fcb3288..eb356941ec 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1030,7 +1030,7 @@ impl Service {
                     // This is the code path for geniune availability transitions (i.e node
                     // goes unavailable and/or comes back online).
                     let res = self
-                        .node_state_configure(node_id, Some(new_availability), None, &node_lock)
+                        .node_state_configure(node_id, Some(new_availability), None)
                         .await;

                     match res {
@@ -1317,7 +1317,7 @@ impl Service {
             if count_min != count_max {
                 // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in
                 // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll
-                // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now.
+                // be dropped later via background reconciliation if it isn't available right now.
                 tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}");
                 let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?;
@@ -1727,118 +1727,6 @@ impl Service {
         }
     }

-    // When the availability state of a node transitions to active, we must do a full reconciliation
-    // of LocationConfigs on that node. This is because while a node was offline:
-    // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node
-    // - aborting a tenant shard split might have left rogue child shards behind on this node.
-    //
-    // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other
-    // Reconcilers might communicate with the node, and these must not overlap with the work we do in
-    // this function.
-    //
-    // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but
-    // for written for a single node rather than as a batch job for all nodes.
-    #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))]
-    async fn node_activate_reconcile(
-        &self,
-        mut node: Node,
-        _lock: &TracingExclusiveGuard<NodeOperations>,
-    ) -> Result<(), ApiError> {
-        // This Node is a mutable local copy: we will set it active so that we can use its
-        // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated
-        // later.
-        node.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
-
-        let configs = match node
-            .with_client_retries(
-                |client| async move { client.list_location_config().await },
-                &self.config.jwt_token,
-                1,
-                5,
-                SHORT_RECONCILE_TIMEOUT,
-                &self.cancel,
-            )
-            .await
-        {
-            None => {
-                // We're shutting down (the Node's cancellation token can't have fired, because
-                // we're the only scope that has a reference to it, and we didn't fire it).
-                return Err(ApiError::ShuttingDown);
-            }
-            Some(Err(e)) => {
-                // This node didn't succeed listing its locations: it may not proceed to active state
-                // as it is apparently unavailable.
-                return Err(ApiError::PreconditionFailed(
-                    format!("Failed to query node location configs, cannot activate ({e})").into(),
-                ));
-            }
-            Some(Ok(configs)) => configs,
-        };
-        tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len());
-
-        let mut cleanup = Vec::new();
-        {
-            let mut locked = self.inner.write().unwrap();
-
-            for (tenant_shard_id, observed_loc) in configs.tenant_shards {
-                let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                    cleanup.push(tenant_shard_id);
-                    continue;
-                };
-                tenant_shard
-                    .observed
-                    .locations
-                    .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
-            }
-        }
-
-        for tenant_shard_id in cleanup {
-            tracing::info!("Detaching {tenant_shard_id}");
-            match node
-                .with_client_retries(
-                    |client| async move {
-                        let config = LocationConfig {
-                            mode: LocationConfigMode::Detached,
-                            generation: None,
-                            secondary_conf: None,
-                            shard_number: tenant_shard_id.shard_number.0,
-                            shard_count: tenant_shard_id.shard_count.literal(),
-                            shard_stripe_size: 0,
-                            tenant_conf: models::TenantConfig::default(),
-                        };
-                        client
-                            .location_config(tenant_shard_id, config, None, false)
-                            .await
-                    },
-                    &self.config.jwt_token,
-                    1,
-                    5,
-                    SHORT_RECONCILE_TIMEOUT,
-                    &self.cancel,
-                )
-                .await
-            {
-                None => {
-                    // We're shutting down (the Node's cancellation token can't have fired, because
-                    // we're the only scope that has a reference to it, and we didn't fire it).
-                    return Err(ApiError::ShuttingDown);
-                }
-                Some(Err(e)) => {
-                    // Do not let the node proceed to Active state if it is not responsive to requests
-                    // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing
-                    // detach completing: we should not let this node back into the set of nodes considered
-                    // okay for scheduling.
-                    return Err(ApiError::Conflict(format!(
-                        "Node {node} failed to detach {tenant_shard_id}: {e}"
-                    )));
-                }
-                Some(Ok(_)) => {}
-            };
-        }
-
-        Ok(())
-    }
-
     pub(crate) async fn re_attach(
         &self,
         reattach_req: ReAttachRequest,
@@ -1926,9 +1814,8 @@ impl Service {
             }
         }

-        // We consider a node Active once we have composed a re-attach response, but we
-        // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response
-        // implicitly synchronizes the LocationConfigs on the node.
+        // We consider a node Active once we have composed a re-attach response:
+        // the handling of the re-attach response implicitly synchronizes the LocationConfigs on the node.
         //
         // Setting a node active unblocks any Reconcilers that might write to the location config API,
         // but those requests will not be accepted by the node until it has finished processing
@@ -5511,7 +5398,6 @@ impl Service {
         node_id: NodeId,
         availability: Option<NodeAvailability>,
         scheduling: Option<NodeSchedulingPolicy>,
-        node_lock: &TracingExclusiveGuard<NodeOperations>,
     ) -> Result<AvailabilityTransition, ApiError> {
         if let Some(scheduling) = scheduling {
             // Scheduling is a persistent part of Node: we must write updates to the database before
@@ -5519,32 +5405,15 @@ impl Service {
             self.persistence.update_node(node_id, scheduling).await?;
         }

-        // If we're activating a node, then before setting it active we must reconcile any shard locations
-        // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup,
-        // by calling [`Self::node_activate_reconcile`]
-        //
-        // The transition we calculate here remains valid later in the function because we hold the op lock on the node:
-        // nothing else can mutate its availability while we run.
         let availability_transition = if let Some(input_availability) = availability.as_ref() {
-            let (activate_node, availability_transition) = {
-                let locked = self.inner.read().unwrap();
-                let Some(node) = locked.nodes.get(&node_id) else {
-                    return Err(ApiError::NotFound(
-                        anyhow::anyhow!("Node {} not registered", node_id).into(),
-                    ));
-                };
-
-                (
-                    node.clone(),
-                    node.get_availability_transition(input_availability),
-                )
+            let locked = self.inner.read().unwrap();
+            let Some(node) = locked.nodes.get(&node_id) else {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Node {} not registered", node_id).into(),
+                ));
             };
-            if matches!(availability_transition, AvailabilityTransition::ToActive) {
-                self.node_activate_reconcile(activate_node, node_lock)
-                    .await?;
-            }
-            availability_transition
+            node.get_availability_transition(input_availability)
         } else {
             AvailabilityTransition::Unchanged
         };
@@ -5734,7 +5603,7 @@ impl Service {
             trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await;

         let transition = self
-            .node_state_configure(node_id, availability, scheduling, &node_lock)
+            .node_state_configure(node_id, availability, scheduling)
             .await?;
         self.handle_node_availability_transition(node_id, transition, &node_lock)
             .await