mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
storcon: implement endpoints for cancellation of drain and fill operations (#8029)
## Problem There's no way to cancel drain and fill operations. ## Summary of changes Implement HTTP endpoints to allow cancelling of background operations. When the operationis cancelled successfully, the node scheduling policy will revert to `Active`.
This commit is contained in:
@@ -502,6 +502,17 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
state.service.cancel_node_drain(node_id).await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -513,6 +524,17 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
|
||||
state.service.cancel_node_fill(node_id).await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_split(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
@@ -871,9 +893,23 @@ pub fn make_router(
|
||||
.put("/control/v1/node/:node_id/drain", |r| {
|
||||
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
|
||||
})
|
||||
.delete("/control/v1/node/:node_id/drain", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_cancel_node_drain,
|
||||
RequestName("control_v1_cancel_node_drain"),
|
||||
)
|
||||
})
|
||||
.put("/control/v1/node/:node_id/fill", |r| {
|
||||
named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
|
||||
})
|
||||
.delete("/control/v1/node/:node_id/fill", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_cancel_node_fill,
|
||||
RequestName("control_v1_cancel_node_fill"),
|
||||
)
|
||||
})
|
||||
// TODO(vlad): endpoint for cancelling drain and fill
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
|
||||
@@ -4541,7 +4541,8 @@ impl Service {
|
||||
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let cancel = self.cancel.child_token();
|
||||
let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
|
||||
|
||||
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
|
||||
operation: Operation::Drain(Drain { node_id }),
|
||||
@@ -4552,6 +4553,8 @@ impl Service {
|
||||
let service = self.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
let _gate_guard = gate_guard;
|
||||
|
||||
scopeguard::defer! {
|
||||
let prev = service.inner.write().unwrap().ongoing_operation.take();
|
||||
|
||||
@@ -4593,6 +4596,44 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
|
||||
let (node_available, node_policy) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let nodes = &locked.nodes;
|
||||
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
|
||||
anyhow::anyhow!("Node {} not registered", node_id).into(),
|
||||
))?;
|
||||
|
||||
(node.is_available(), node.get_scheduling())
|
||||
};
|
||||
|
||||
if !node_available {
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
format!("Node {node_id} is currently unavailable").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} has no drain in progress").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
|
||||
if let Operation::Drain(drain) = op_handler.operation {
|
||||
if drain.node_id == node_id {
|
||||
tracing::info!("Cancelling background drain operation for node {node_id}");
|
||||
op_handler.cancel.cancel();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} has no drain in progress").into(),
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
|
||||
let (ongoing_op, node_available, node_policy, total_nodes_count) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
@@ -4635,7 +4676,8 @@ impl Service {
|
||||
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
|
||||
.await?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let cancel = self.cancel.child_token();
|
||||
let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
|
||||
|
||||
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
|
||||
operation: Operation::Fill(Fill { node_id }),
|
||||
@@ -4646,6 +4688,8 @@ impl Service {
|
||||
let service = self.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
let _gate_guard = gate_guard;
|
||||
|
||||
scopeguard::defer! {
|
||||
let prev = service.inner.write().unwrap().ongoing_operation.take();
|
||||
|
||||
@@ -4687,6 +4731,44 @@ impl Service {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
|
||||
let (node_available, node_policy) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let nodes = &locked.nodes;
|
||||
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
|
||||
anyhow::anyhow!("Node {} not registered", node_id).into(),
|
||||
))?;
|
||||
|
||||
(node.is_available(), node.get_scheduling())
|
||||
};
|
||||
|
||||
if !node_available {
|
||||
return Err(ApiError::ResourceUnavailable(
|
||||
format!("Node {node_id} is currently unavailable").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} has no fill in progress").into(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
|
||||
if let Operation::Fill(fill) = op_handler.operation {
|
||||
if fill.node_id == node_id {
|
||||
tracing::info!("Cancelling background drain operation for node {node_id}");
|
||||
op_handler.cancel.cancel();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(ApiError::PreconditionFailed(
|
||||
format!("Node {node_id} has no fill in progress").into(),
|
||||
))
|
||||
}
|
||||
|
||||
/// Helper for methods that will try and call pageserver APIs for
|
||||
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
|
||||
/// is attached somewhere.
|
||||
@@ -5286,7 +5368,21 @@ impl Service {
|
||||
|
||||
while !inspected_all_shards {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(OperationError::Cancelled);
|
||||
match self
|
||||
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
|
||||
.await
|
||||
{
|
||||
Ok(()) => return Err(OperationError::Cancelled),
|
||||
Err(err) => {
|
||||
return Err(OperationError::FinalizeError(
|
||||
format!(
|
||||
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
|
||||
node_id, err
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
@@ -5356,9 +5452,29 @@ impl Service {
|
||||
waiters = self
|
||||
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
|
||||
.await;
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleepy-drain-loop");
|
||||
}
|
||||
|
||||
while !waiters.is_empty() {
|
||||
if cancel.is_cancelled() {
|
||||
match self
|
||||
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
|
||||
.await
|
||||
{
|
||||
Ok(()) => return Err(OperationError::Cancelled),
|
||||
Err(err) => {
|
||||
return Err(OperationError::FinalizeError(
|
||||
format!(
|
||||
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
|
||||
node_id, err
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Awaiting {} pending drain reconciliations", waiters.len());
|
||||
|
||||
waiters = self
|
||||
@@ -5495,7 +5611,21 @@ impl Service {
|
||||
// we validate to ensure that it has not gone stale in the meantime.
|
||||
while !tids_to_promote.is_empty() {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(OperationError::Cancelled);
|
||||
match self
|
||||
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
|
||||
.await
|
||||
{
|
||||
Ok(()) => return Err(OperationError::Cancelled),
|
||||
Err(err) => {
|
||||
return Err(OperationError::FinalizeError(
|
||||
format!(
|
||||
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
|
||||
node_id, err
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
@@ -5563,6 +5693,24 @@ impl Service {
|
||||
}
|
||||
|
||||
while !waiters.is_empty() {
|
||||
if cancel.is_cancelled() {
|
||||
match self
|
||||
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
|
||||
.await
|
||||
{
|
||||
Ok(()) => return Err(OperationError::Cancelled),
|
||||
Err(err) => {
|
||||
return Err(OperationError::FinalizeError(
|
||||
format!(
|
||||
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
|
||||
node_id, err
|
||||
)
|
||||
.into(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Awaiting {} pending fill reconciliations", waiters.len());
|
||||
|
||||
waiters = self
|
||||
|
||||
Reference in New Issue
Block a user