storcon: Make node deletion process cancellable (#12320)

## Problem

The current deletion operation is synchronous and blocking, which is
unsuitable for potentially long-running tasks like node deletion. In such
cases, the standard HTTP request-response pattern is not a good fit.

## Summary of Changes

- Added new `storcon_cli` commands: `NodeStartDelete` and
`NodeCancelDelete` to initiate and cancel deletion asynchronously.
- Added corresponding `storcon` HTTP handlers to support the new
start/cancel deletion flow.
- Introduced a new type of background operation: `Delete`, to track and
manage the deletion process outside the request lifecycle.

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
This commit is contained in:
Aleksandr Sarantsev
2025-07-04 18:08:09 +04:00
committed by GitHub
parent 225267b3ae
commit b2705cfee6
12 changed files with 698 additions and 172 deletions

View File

@@ -635,18 +635,23 @@ impl Persistence {
let updated = self
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
Box::pin(async move {
// Check if the node is not marked as deleted
let deleted_node: i64 = nodes
let node: Option<NodePersistence> = nodes
.filter(node_id.eq(input_node_id.0 as i64))
.filter(lifecycle.eq(String::from(NodeLifecycle::Deleted)))
.count()
.get_result(conn)
.await?;
if deleted_node > 0 {
return Err(DatabaseError::Logical(format!(
"Node {input_node_id} is marked as deleted, re-attach is not allowed"
)));
}
.first::<NodePersistence>(conn)
.await
.optional()?;
// Check if the node is not marked as deleted
match node {
Some(node) if matches!(NodeLifecycle::from_str(&node.lifecycle), Ok(NodeLifecycle::Deleted)) => {
return Err(DatabaseError::Logical(format!(
"Node {input_node_id} is marked as deleted, re-attach is not allowed"
)));
}
_ => {
// go through
}
};
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(input_node_id.0 as i64))
@@ -664,21 +669,23 @@ impl Persistence {
.load(conn)
.await?;
// If the node went through a drain and restart phase before re-attaching,
// then reset it's node scheduling policy to active.
diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.filter(
scheduling_policy
.eq(String::from(NodeSchedulingPolicy::PauseForRestart))
.or(scheduling_policy
.eq(String::from(NodeSchedulingPolicy::Draining)))
.or(scheduling_policy
.eq(String::from(NodeSchedulingPolicy::Filling))),
)
.set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
.execute(conn)
.await?;
if let Some(node) = node {
let old_scheduling_policy =
NodeSchedulingPolicy::from_str(&node.scheduling_policy).unwrap();
let new_scheduling_policy = match old_scheduling_policy {
NodeSchedulingPolicy::Active => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::PauseForRestart => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::Draining => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::Filling => NodeSchedulingPolicy::Active,
NodeSchedulingPolicy::Pause => NodeSchedulingPolicy::Pause,
NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause,
};
diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set(scheduling_policy.eq(String::from(new_scheduling_policy)))
.execute(conn)
.await?;
}
Ok(updated)
})