mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
pageserver: fix race that can wedge background tasks (#11047)
## Problem

`wait_for_active_tenant()`, used when starting background tasks, has a race condition that can cause it to wait forever (until cancelled). It first checks the current tenant state and then subscribes for state updates, but if the state changes between those two steps, it is never notified of that change. We've seen this wedge compaction tasks, which can cause unbounded layer file buildup and read amplification.

## Summary of changes

Use `watch::Receiver::wait_for()` to check both the current and new tenant states.
This commit is contained in:
@@ -473,21 +473,15 @@ async fn wait_for_active_tenant(
|
||||
}
|
||||
|
||||
let mut update_rx = tenant.subscribe_for_state_updates();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return ControlFlow::Break(()),
|
||||
result = update_rx.changed() => if result.is_err() {
|
||||
tokio::select! {
|
||||
result = update_rx.wait_for(|s| s == &TenantState::Active) => {
|
||||
if result.is_err() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
match &*update_rx.borrow() {
|
||||
TenantState::Active => {
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
state => debug!("Not running the task loop, tenant is not active: {state:?}"),
|
||||
}
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
ControlFlow::Continue(())
|
||||
},
|
||||
_ = cancel.cancelled() => ControlFlow::Break(()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user