mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
fix(pageserver): deadlock in gc-compaction (#8590)
We need both compaction and gc lock for gc-compaction. The lock order should be the same everywhere, otherwise there could be a deadlock where A waits for B and B waits for A. We also had a double-lock issue. The compaction lock gets acquired in the outer `compact` function. Note that the unit tests directly call `compact_with_gc`, and therefore not triggering the issue. ## Summary of changes Ensure all places acquire compact lock and then gc lock. Remove an extra compact lock acqusition. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -1646,19 +1646,23 @@ impl Timeline {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future.
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
|
||||
let _compaction_lock = tokio::select! {
|
||||
guard = self.compaction_lock.lock() => guard,
|
||||
// TODO: refactor to CompactionError to correctly pass cancelled error
|
||||
_ = cancel.cancelled() => return Err(anyhow!("cancelled")),
|
||||
let gc_lock = async {
|
||||
tokio::select! {
|
||||
guard = self.gc_lock.lock() => Ok(guard),
|
||||
// TODO: refactor to CompactionError to correctly pass cancelled error
|
||||
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
|
||||
}
|
||||
};
|
||||
|
||||
let _gc = tokio::select! {
|
||||
guard = self.gc_lock.lock() => guard,
|
||||
// TODO: refactor to CompactionError to correctly pass cancelled error
|
||||
_ = cancel.cancelled() => return Err(anyhow!("cancelled")),
|
||||
};
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await?;
|
||||
|
||||
info!("running enhanced gc bottom-most compaction");
|
||||
|
||||
@@ -2063,9 +2067,11 @@ impl Timeline {
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
|
||||
};
|
||||
|
||||
self.remote_client
|
||||
.schedule_compaction_update(&layer_selection, &compact_to)?;
|
||||
|
||||
drop(gc_lock);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline: &Timeline,
|
||||
) -> anyhow::Result<()> {
|
||||
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
|
||||
let guards = crate::timed(
|
||||
guards,
|
||||
"acquire gc and compaction locks",
|
||||
// Always ensure the lock order is compaction -> gc.
|
||||
let compaction_lock = timeline.compaction_lock.lock();
|
||||
let compaction_lock = crate::timed(
|
||||
compaction_lock,
|
||||
"acquires compaction lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
let gc_lock = timeline.gc_lock.lock();
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
@@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
|
||||
info!("finished deleting layer files, releasing locks");
|
||||
drop(guards);
|
||||
drop(gc_lock);
|
||||
drop(compaction_lock);
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
|
||||
Reference in New Issue
Block a user