storcon: add an API to cancel ongoing reconciler (#9520)

## Problem

If something goes wrong with a live migration, we currently only have
awkward ways to interrupt that:
- Restart the storage controller
- Ask it to do some other modification/migration on the shard, which we
don't really want.

## Summary of changes

- Add a new `/cancel` control API, and storcon_cli wrapper for it, which
fires the Reconciler's cancellation token. This is just for on-call use
and we do not expect it to be used by any other services.
This commit is contained in:
John Spray
2024-10-28 09:26:01 +00:00
committed by GitHub
parent 923974d4da
commit 33baca07b6
5 changed files with 103 additions and 1 deletions

View File

@@ -111,6 +111,11 @@ enum Command {
#[arg(long)]
node: NodeId,
},
/// Cancel any ongoing reconciliation for this shard
TenantShardCancelReconcile {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
@@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantShardCancelReconcile { tenant_shard_id } => {
storcon_client
.dispatch::<(), ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"),
None,
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;

View File

@@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate(
)
}
async fn handle_tenant_shard_cancel_reconcile(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
json_response(
StatusCode::OK,
service
.tenant_shard_cancel_reconcile(tenant_shard_id)
.await?,
)
}
async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -1776,6 +1798,16 @@ pub fn make_router(
RequestName("control_v1_tenant_migrate"),
)
})
.put(
"/control/v1/tenant/:tenant_shard_id/cancel_reconcile",
|r| {
tenant_service_handler(
r,
handle_tenant_shard_cancel_reconcile,
RequestName("control_v1_tenant_cancel_reconcile"),
)
},
)
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(
r,

View File

@@ -4834,6 +4834,43 @@ impl Service {
Ok(TenantShardMigrateResponse {})
}
/// 'cancel' in this context means cancel any ongoing reconcile
pub(crate) async fn tenant_shard_cancel_reconcile(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), ApiError> {
// Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete
let waiter = {
let locked = self.inner.write().unwrap();
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant shard not found").into(),
));
};
let waiter = shard.get_waiter();
match waiter {
None => {
tracing::info!("Shard does not have an ongoing Reconciler");
return Ok(());
}
Some(waiter) => {
tracing::info!("Cancelling Reconciler");
shard.cancel_reconciler();
waiter
}
}
};
// Cancellation should be prompt. If this fails we have still done our job of firing the
// cancellation token, but by returning an ApiError we will indicate to the caller that
// the Reconciler is misbehaving and not respecting the cancellation token
self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT)
.await?;
Ok(())
}
/// This is for debug/support only: we simply drop all state for a tenant, without
/// detaching or deleting it on pageservers.
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {

View File

@@ -1317,6 +1317,12 @@ impl TenantShard {
})
}
pub(crate) fn cancel_reconciler(&self) {
if let Some(handle) = self.reconciler.as_ref() {
handle.cancel.cancel()
}
}
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
/// if it is not already running
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {

View File

@@ -872,6 +872,14 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3
assert all(v["may_schedule"] for v in response.json()["nodes"].values())
# Reconciler cancel API should be a no-op when nothing is in flight
env.storage_controller.request(
"PUT",
f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile",
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
# Node unclean drop API
response = env.storage_controller.request(
"POST",
f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop",
@@ -879,6 +887,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
)
assert len(env.storage_controller.node_list()) == 1
# Tenant unclean drop API
response = env.storage_controller.request(
"POST",
f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop",
@@ -892,7 +901,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
assert len(response.json()) == 1
# Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're
# meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind.
env.storage_controller.consistency_check()
@@ -1660,6 +1668,11 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])
assert "Stop" in storcon_cli(["tenants"])[3]
# Cancel ongoing reconcile on a tenant
storcon_cli(
["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"]
)
# Change a tenant's placement
storcon_cli(
["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"]