From d589498c6f556d6c1b246b00800fe51ece416485 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 24 Oct 2024 15:23:09 +0100 Subject: [PATCH] storcon: respect Reconciler::cancel during await_lsn (#9486) ## Problem When a pageserver is misbehaving (e.g. we hit an ingest bug or something is pathologically slow), the storage controller could get stuck in the part of live migration that waits for LSNs to catch up. This is a problem, because it can prevent us migrating the troublesome tenant to another pageserver. Closes: https://github.com/neondatabase/cloud/issues/19169 ## Summary of changes - Respect Reconciler::cancel during await_lsn. --- storage_controller/src/reconciler.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 9d2182d44c..3ad386a95b 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -450,6 +450,9 @@ impl Reconciler { } } + /// This function does _not_ mutate any state, so it is cancellation safe. + /// + /// This function does not respect [`Self::cancel`], callers should handle that. async fn await_lsn( &self, tenant_shard_id: TenantShardId, @@ -570,8 +573,10 @@ impl Reconciler { if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) - .await?; + tokio::select! { + r = self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) => {r?;} + _ = self.cancel.cancelled() => {return Err(ReconcileError::Cancel)} + }; } tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");