Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-10 15:02:56 +00:00
storage controller: add node deletion API (#8226)
## Problem

In anticipation of later adding a really nice drain+delete API, I initially only added an intentionally basic `/drop` API that is just about usable for deleting nodes in a pinch, but requires some ugly storage controller restarts to persuade it to restart secondaries.

## Summary of changes

I started making a few tiny fixes, and ended up writing the delete API...

- Quality of life nit: ordering of node + tenant listings in storcon_cli
- Papercut: fix the attach_hook using the wrong operation type for reporting slow locks
- Make Service::spawn tolerate `generation_pageserver` columns that point to nonexistent node IDs. I started out thinking of this as a general resilience thing, but when implementing the delete API I realized it was actually a legitimate end state after the delete API is called (as that API doesn't wait for all reconciles to succeed).
- Add a `DELETE` API for nodes, which does not gracefully drain, but does reschedule everything. This is safe to use when the system is in any state, but will incur availability gaps for any tenants that weren't already live-migrated away. If tenants have already been drained, it becomes a totally clean + safe way to decommission a node (see the usage sketch below).
- Add a test and a storcon_cli wrapper for it

This is meant to be a robust initial API that lets us remove nodes without doing ugly things like restarting the storage controller -- it's not quite a totally graceful node-draining routine yet. There's more work in https://github.com/neondatabase/neon/issues/8333 to get to our end state.
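As an aside for readers of this commit (not part of the change itself), the sketch below shows how a client might call the new endpoint. Only the `DELETE /control/v1/node/:node_id` route, the admin-scope check, and the idempotent retry behavior are taken from the diff that follows; the `reqwest`-based client, the `base_url`/`admin_jwt` parameters, and the error handling are assumptions made for illustration.

```rust
// Hypothetical client-side sketch: calling the new node deletion endpoint.
// Only the route and the admin-scope requirement are taken from this commit;
// the reqwest client, parameters, and error handling are illustrative.
use reqwest::StatusCode;

async fn delete_node(base_url: &str, admin_jwt: &str, node_id: u64) -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    let response = client
        // e.g. DELETE <base_url>/control/v1/node/<node_id>
        .delete(format!("{base_url}/control/v1/node/{node_id}"))
        // handle_node_delete checks Scope::Admin, so an admin-scoped token is required.
        .bearer_auth(admin_jwt)
        .send()
        .await?;

    // The API is idempotent: if the node is already gone, the controller logs
    // "Node not found: presuming this is a retry" and still returns 200 OK.
    anyhow::ensure!(
        response.status() == StatusCode::OK,
        "node deletion failed with status {}",
        response.status()
    );
    Ok(())
}
```

Because `node_delete` reschedules shards and kicks off reconciles before it drops the node from the in-memory map, reconcilers spawned during deletion still know about the node and can live-migrate away from it if it is still online.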
@@ -456,6 +456,14 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
     json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
 }

+async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
+}
+
 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;

@@ -878,6 +886,9 @@ pub fn make_router(
         .post("/control/v1/node", |r| {
             named_request_span(r, handle_node_register, RequestName("control_v1_node"))
         })
+        .delete("/control/v1/node/:node_id", |r| {
+            named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete"))
+        })
         .get("/control/v1/node", |r| {
             named_request_span(r, handle_node_list, RequestName("control_v1_node"))
         })
@@ -2,6 +2,7 @@ use std::{
     borrow::Cow,
     cmp::Ordering,
     collections::{BTreeMap, HashMap, HashSet},
+    ops::Deref,
     path::PathBuf,
     str::FromStr,
     sync::Arc,
@@ -115,12 +116,14 @@ enum TenantOperations {
     SecondaryDownload,
     TimelineCreate,
     TimelineDelete,
+    AttachHook,
 }

 #[derive(Clone, strum_macros::Display)]
 enum NodeOperations {
     Register,
     Configure,
+    Delete,
 }

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
@@ -845,9 +848,10 @@ impl Service {
         tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
         sequence=%result.sequence
     ))]
-    fn process_result(&self, result: ReconcileResult) {
+    fn process_result(&self, mut result: ReconcileResult) {
         let mut locked = self.inner.write().unwrap();
-        let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
+        let (nodes, tenants, _scheduler) = locked.parts_mut();
+        let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else {
             // A reconciliation result might race with removing a tenant: drop results for
             // tenants that aren't in our map.
             return;
@@ -864,6 +868,13 @@ impl Service {
         // Let the TenantShard know it is idle.
         tenant.reconcile_complete(result.sequence);

+        // In case a node was deleted while this reconcile is in flight, filter it out of the update we will
+        // make to the tenant
+        result
+            .observed
+            .locations
+            .retain(|node_id, _loc| nodes.contains_key(node_id));
+
         match result.result {
             Ok(()) => {
                 for (node_id, loc) in &result.observed.locations {
@@ -873,6 +884,7 @@ impl Service {
                         tracing::info!("Setting observed location {} to None", node_id,)
                     }
                 }
+
                 tenant.observed = result.observed;
                 tenant.waiter.advance(result.sequence);
             }
@@ -1109,8 +1121,16 @@ impl Service {
             // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
             // it with what we can infer: the node for which a generation was most recently issued.
             let mut intent = IntentState::new();
-            if let Some(generation_pageserver) = tsp.generation_pageserver {
-                intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
+            if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64))
+            {
+                if nodes.contains_key(&generation_pageserver) {
+                    intent.set_attached(&mut scheduler, Some(generation_pageserver));
+                } else {
+                    // If a node was removed before being completely drained, it is legal for it to leave behind a `generation_pageserver` referring
+                    // to a non-existent node, because node deletion doesn't block on completing the reconciliations that will issue new generations
+                    // on different pageservers.
+                    tracing::warn!("Tenant shard {tenant_shard_id} references non-existent node {generation_pageserver} in database, will be rescheduled");
+                }
             }
             let new_tenant = TenantShard::from_persistent(tsp, intent)?;

@@ -1237,7 +1257,7 @@ impl Service {
         let _tenant_lock = trace_exclusive_lock(
             &self.tenant_op_locks,
             attach_req.tenant_shard_id.tenant_id,
-            TenantOperations::ShardSplit,
+            TenantOperations::AttachHook,
         )
         .await;

@@ -4210,8 +4230,6 @@ impl Service {
     /// This is for debug/support only: we simply drop all state for a tenant, without
     /// detaching or deleting it on pageservers. We do not try and re-schedule any
     /// tenants that were on this node.
-    ///
-    /// TODO: proper node deletion API that unhooks things more gracefully
     pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> {
         self.persistence.delete_node(node_id).await?;

@@ -4219,6 +4237,7 @@ impl Service {

         for shard in locked.tenants.values_mut() {
             shard.deref_node(node_id);
+            shard.observed.locations.remove(&node_id);
         }

         let mut nodes = (*locked.nodes).clone();
@@ -4230,6 +4249,94 @@
         Ok(())
     }

+    /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense
+    /// that we don't leave any bad state behind in the storage controller, but unclean
+    /// in the sense that we are not carefully draining the node.
+    pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let _node_lock =
+            trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await;
+
+        // 1. Atomically update in-memory state:
+        //   - set the scheduling state to Pause to make subsequent scheduling ops skip it
+        //   - update shards' intents to exclude the node, and reschedule any shards whose intents we modified.
+        //   - drop the node from the main nodes map, so that when running reconciles complete they do not
+        //     re-insert references to this node into the ObservedState of shards
+        //   - drop the node from the scheduler
+        {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, tenants, scheduler) = locked.parts_mut();
+
+            {
+                let mut nodes_mut = (*nodes).deref().clone();
+                match nodes_mut.get_mut(&node_id) {
+                    Some(node) => {
+                        // We do not bother setting this in the database, because we're about to delete the row anyway, and
+                        // if we crash it would not be desirable to leave the node paused after a restart.
+                        node.set_scheduling(NodeSchedulingPolicy::Pause);
+                    }
+                    None => {
+                        tracing::info!(
+                            "Node not found: presuming this is a retry and returning success"
+                        );
+                        return Ok(());
+                    }
+                }
+
+                *nodes = Arc::new(nodes_mut);
+            }
+
+            for (tenant_shard_id, shard) in tenants {
+                if shard.deref_node(node_id) {
+                    // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise
+                    // it won't properly do anti-affinity.
+                    let mut schedule_context = ScheduleContext::default();
+
+                    if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
+                        // TODO: implement force flag to remove a node even if we can't reschedule
+                        // a tenant
+                        tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}");
+                        return Err(e.into());
+                    } else {
+                        tracing::info!(
+                            "Rescheduled shard {tenant_shard_id} away from node during deletion"
+                        )
+                    }
+
+                    self.maybe_reconcile_shard(shard, nodes);
+                }
+
+                // Here we remove an existing observed location for the node we're removing, and it will
+                // not be re-added by a reconciler's completion because we filter out removed nodes in
+                // process_result.
+                //
+                // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
+                // means any reconciles we spawned will know about the node we're deleting, enabling them
+                // to do live migrations if it's still online.
+                shard.observed.locations.remove(&node_id);
+            }
+
+            scheduler.node_remove(node_id);
+
+            {
+                let mut nodes_mut = (**nodes).clone();
+                nodes_mut.remove(&node_id);
+                *nodes = Arc::new(nodes_mut);
+            }
+        }
+
+        // Note: some `generation_pageserver` columns on tenant shards in the database may still refer to
+        // the removed node, as this column means "The pageserver to which this generation was issued", and
+        // their generations won't get updated until the reconcilers moving them away from this node complete.
+        // That is safe because in Service::spawn we only use generation_pageserver if it refers to a node
+        // that exists.
+
+        // 2. Actually delete the node from the database and from in-memory state
+        tracing::info!("Deleting node from database");
+        self.persistence.delete_node(node_id).await?;
+
+        Ok(())
+    }
+
     pub(crate) async fn node_list(&self) -> Result<Vec<Node>, ApiError> {
         let nodes = {
             self.inner
@@ -1229,18 +1229,27 @@ impl TenantShard {
         }
     }

-    // If we had any state at all referring to this node ID, drop it. Does not
-    // attempt to reschedule.
-    pub(crate) fn deref_node(&mut self, node_id: NodeId) {
+    /// If we had any state at all referring to this node ID, drop it. Does not
+    /// attempt to reschedule.
+    ///
+    /// Returns true if we modified the node's intent state.
+    pub(crate) fn deref_node(&mut self, node_id: NodeId) -> bool {
+        let mut intent_modified = false;
+
+        // Drop if this node was our attached intent
         if self.intent.attached == Some(node_id) {
             self.intent.attached = None;
+            intent_modified = true;
         }

+        // Drop from the list of secondaries, and check if we modified it
+        let had_secondaries = self.intent.secondary.len();
         self.intent.secondary.retain(|n| n != &node_id);
-
-        self.observed.locations.remove(&node_id);
+        intent_modified |= self.intent.secondary.len() != had_secondaries;

         debug_assert!(!self.intent.all_pageservers().contains(&node_id));
+
+        intent_modified
     }

     pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {