From d857f63e3b1be9b1b70ef3f8c64cb088d126743c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 28 Feb 2025 18:00:22 +0100 Subject: [PATCH] pageserver: fix race that can wedge background tasks (#11047) ## Problem `wait_for_active_tenant()`, used when starting background tasks, has a race condition that can cause it to wait forever (until cancelled). It first checks the current tenant state, and then subscribes for state updates, but if the state changes between these then it won't be notified about it. We've seen this wedge compaction tasks, which can cause unbounded layer file buildup and read amplification. ## Summary of changes Use `watch::Receiver::wait_for()` to check both the current and new tenant states. --- pageserver/src/tenant/tasks.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index c90f81889b..589ac5ae88 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -473,21 +473,15 @@ async fn wait_for_active_tenant( } let mut update_rx = tenant.subscribe_for_state_updates(); - loop { - tokio::select! { - _ = cancel.cancelled() => return ControlFlow::Break(()), - result = update_rx.changed() => if result.is_err() { + tokio::select! { + result = update_rx.wait_for(|s| s == &TenantState::Active) => { + if result.is_err() { return ControlFlow::Break(()); } - } - - match &*update_rx.borrow() { - TenantState::Active => { - debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(()); - } - state => debug!("Not running the task loop, tenant is not active: {state:?}"), - } + debug!("Tenant state changed to active, continuing the task loop"); + ControlFlow::Continue(()) + }, + _ = cancel.cancelled() => ControlFlow::Break(()), } }