mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 20:12:54 +00:00
fix deadlock around TENANTS (#5285)
The sequence that can lead to a deadlock: 1. DELETE request gets all the way to `tenant.shutdown(progress, false).await.is_err() ` , while holding TENANTS.read() 2. POST request for tenant creation comes in, calls `tenant_map_insert`, it does `let mut guard = TENANTS.write().await;` 3. Something that `tenant.shutdown()` needs to wait for needs a `TENANTS.read().await`. The only case identified in exhaustive manual scanning of the code base is this one: Imitate size access does `get_tenant().await`, which does `TENANTS.read().await` under the hood. In the above case (1) waits for (3), (3)'s read-lock request is queued behind (2)'s write-lock, and (2) waits for (1). Deadlock. I made a reproducer/proof-that-above-hypothesis-holds in https://github.com/neondatabase/neon/pull/5281 , but, it's not ready for merge yet and we want the fix _now_. fixes https://github.com/neondatabase/neon/issues/5284
This commit is contained in:
committed by
GitHub
parent
36c261851f
commit
5be8d38a63
@@ -503,6 +503,8 @@ pub enum GetTenantError {
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
///
|
||||
/// This method is cancel-safe.
|
||||
pub async fn get_tenant(
|
||||
tenant_id: TenantId,
|
||||
active_only: bool,
|
||||
|
||||
@@ -328,9 +328,24 @@ impl Timeline {
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
|
||||
// likely, we're shutting down
|
||||
return ControlFlow::Break(());
|
||||
//
|
||||
// It is critical we are responsive to cancellation here. Otherwise, we deadlock with
|
||||
// tenant deletion (holds TENANTS in read mode) any other task that attempts to
|
||||
// acquire TENANTS in write mode before we here call get_tenant.
|
||||
// See https://github.com/neondatabase/neon/issues/5284.
|
||||
let res = tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
|
||||
res
|
||||
}
|
||||
};
|
||||
let tenant = match res {
|
||||
Ok(t) => t,
|
||||
Err(_) => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
|
||||
Reference in New Issue
Block a user