mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-09 06:22:57 +00:00
storcon: respect Reconciler::cancel during await_lsn (#9486)
## Problem

When a pageserver is misbehaving (e.g. we hit an ingest bug or something is pathologically slow), the storage controller could get stuck in the part of live migration that waits for LSNs to catch up. This is a problem because it can prevent us from migrating the troublesome tenant to another pageserver.

Closes: https://github.com/neondatabase/cloud/issues/19169

## Summary of changes

- Respect Reconciler::cancel during await_lsn.
This commit is contained in:
@@ -450,6 +450,9 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
/// This function does _not_ mutate any state, so it is cancellation safe.
|
||||
///
|
||||
/// This function does not respect [`Self::cancel`], callers should handle that.
|
||||
async fn await_lsn(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -570,8 +573,10 @@ impl Reconciler {
|
||||
|
||||
if let Some(baseline) = baseline_lsns {
|
||||
tracing::info!("🕑 Waiting for LSN to catch up...");
|
||||
- self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
|
||||
- .await?;
|
||||
+ tokio::select! {
|
||||
+ r = self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) => {r?;}
|
||||
+ _ = self.cancel.cancelled() => {return Err(ReconcileError::Cancel)}
|
||||
+ };
|
||||
}
|
||||
|
||||
tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
|
||||
|
||||
Reference in New Issue
Block a user