mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 03:50:37 +00:00
storcon: add an API to cancel ongoing reconciler (#9520)
## Problem If something goes wrong with a live migration, we currently only have awkward ways to interrupt that: - Restart the storage controller - Ask it to do some other modification/migration on the shard, which we don't really want. ## Summary of changes - Add a new `/cancel` control API, and storcon_cli wrapper for it, which fires the Reconciler's cancellation token. This is just for on-call use and we do not expect it to be used by any other services.
This commit is contained in:
@@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_cancel_reconcile(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_shard_cancel_reconcile(tenant_shard_id)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1776,6 +1798,16 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_migrate"),
|
||||
)
|
||||
})
|
||||
.put(
|
||||
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_shard_cancel_reconcile,
|
||||
RequestName("control_v1_tenant_cancel_reconcile"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
|
||||
@@ -4834,6 +4834,43 @@ impl Service {
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
/// 'cancel' in this context means cancel any ongoing reconcile
|
||||
pub(crate) async fn tenant_shard_cancel_reconcile(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), ApiError> {
|
||||
// Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete
|
||||
let waiter = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant shard not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
let waiter = shard.get_waiter();
|
||||
match waiter {
|
||||
None => {
|
||||
tracing::info!("Shard does not have an ongoing Reconciler");
|
||||
return Ok(());
|
||||
}
|
||||
Some(waiter) => {
|
||||
tracing::info!("Cancelling Reconciler");
|
||||
shard.cancel_reconciler();
|
||||
waiter
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Cancellation should be prompt. If this fails we have still done our job of firing the
|
||||
// cancellation token, but by returning an ApiError we will indicate to the caller that
|
||||
// the Reconciler is misbehaving and not respecting the cancellation token
|
||||
self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is for debug/support only: we simply drop all state for a tenant, without
|
||||
/// detaching or deleting it on pageservers.
|
||||
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
||||
|
||||
@@ -1317,6 +1317,12 @@ impl TenantShard {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn cancel_reconciler(&self) {
|
||||
if let Some(handle) = self.reconciler.as_ref() {
|
||||
handle.cancel.cancel()
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
||||
/// if it is not already running
|
||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
||||
|
||||
Reference in New Issue
Block a user