storcon: Make node deletion process cancellable (#12320)

## Problem The current deletion operation is synchronous and blocking, which is unsuitable for potentially long-running tasks like node deletion. In such cases, the standard HTTP request-response pattern is not a good fit. ## Summary of Changes - Added new `storcon_cli` commands: `NodeStartDelete` and `NodeCancelDelete` to initiate and cancel deletion asynchronously. - Added corresponding `storcon` HTTP handlers to support the new start/cancel deletion flow. - Introduced a new type of background operation: `Delete`, to track and manage the deletion process outside the request lifecycle. --------- Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
2026-05-27 01:50:38 +00:00 · 2025-07-04 18:08:09 +04:00
parent 225267b3ae
commit b2705cfee6
12 changed files with 698 additions and 172 deletions
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -635,18 +635,23 @@ impl Persistence {
        let updated = self
            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
                Box::pin(async move {
-                    // Check if the node is not marked as deleted
-                    let deleted_node: i64 = nodes
+                    let node: Option<NodePersistence> = nodes
                        .filter(node_id.eq(input_node_id.0 as i64))
-                        .filter(lifecycle.eq(String::from(NodeLifecycle::Deleted)))
-                        .count()
-                        .get_result(conn)
-                        .await?;
-                    if deleted_node > 0 {
-                        return Err(DatabaseError::Logical(format!(
-                            "Node {input_node_id} is marked as deleted, re-attach is not allowed"
-                        )));
-                    }
+                        .first::<NodePersistence>(conn)
+                        .await
+                        .optional()?;
+
+                    // Check if the node is not marked as deleted
+                    match node {
+                        Some(node) if matches!(NodeLifecycle::from_str(&node.lifecycle), Ok(NodeLifecycle::Deleted)) => {
+                            return Err(DatabaseError::Logical(format!(
+                                "Node {input_node_id} is marked as deleted, re-attach is not allowed"
+                            )));
+                        }
+                        _ => {
+                            // go through
+                        }
+                    };

                    let rows_updated = diesel::update(tenant_shards)
                        .filter(generation_pageserver.eq(input_node_id.0 as i64))
@@ -664,21 +669,23 @@ impl Persistence {
                        .load(conn)
                        .await?;

-                    // If the node went through a drain and restart phase before re-attaching,
-                    // then reset it's node scheduling policy to active.
-                    diesel::update(nodes)
-                        .filter(node_id.eq(input_node_id.0 as i64))
-                        .filter(
-                            scheduling_policy
-                                .eq(String::from(NodeSchedulingPolicy::PauseForRestart))
-                                .or(scheduling_policy
-                                    .eq(String::from(NodeSchedulingPolicy::Draining)))
-                                .or(scheduling_policy
-                                    .eq(String::from(NodeSchedulingPolicy::Filling))),
-                        )
-                        .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
-                        .execute(conn)
-                        .await?;
+                    if let Some(node) = node {
+                        let old_scheduling_policy =
+                            NodeSchedulingPolicy::from_str(&node.scheduling_policy).unwrap();
+                        let new_scheduling_policy = match old_scheduling_policy {
+                            NodeSchedulingPolicy::Active => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::PauseForRestart => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::Draining => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::Filling => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::Pause => NodeSchedulingPolicy::Pause,
+                            NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause,
+                        };
+                        diesel::update(nodes)
+                            .filter(node_id.eq(input_node_id.0 as i64))
+                            .set(scheduling_policy.eq(String::from(new_scheduling_policy)))
+                            .execute(conn)
+                            .await?;
+                    }

                    Ok(updated)
                })