diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 61d662d25d..421f718ad6 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1646,19 +1646,23 @@ impl Timeline { use std::collections::BTreeSet; // Block other compaction/GC tasks from running for now. GC-compaction could run along - // with legacy compaction tasks in the future. + // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. + // Note that we already acquired the compaction lock when the outer `compact` function gets called. - let _compaction_lock = tokio::select! { - guard = self.compaction_lock.lock() => guard, - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => return Err(anyhow!("cancelled")), + let gc_lock = async { + tokio::select! { + guard = self.gc_lock.lock() => Ok(guard), + // TODO: refactor to CompactionError to correctly pass cancelled error + _ = cancel.cancelled() => Err(anyhow!("cancelled")), + } }; - let _gc = tokio::select! { - guard = self.gc_lock.lock() => guard, - // TODO: refactor to CompactionError to correctly pass cancelled error - _ = cancel.cancelled() => return Err(anyhow!("cancelled")), - }; + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", + std::time::Duration::from_secs(5), + ) + .await?; info!("running enhanced gc bottom-most compaction"); @@ -2063,9 +2067,11 @@ impl Timeline { let mut guard = self.layers.write().await; guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; - self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; + + drop(gc_lock); + Ok(()) } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 9b2403f899..05178c38b4 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory( tenant_shard_id: TenantShardId, timeline: &Timeline, ) -> anyhow::Result<()> { - let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) }; - let guards = crate::timed( - guards, - "acquire gc and compaction locks", + // Always ensure the lock order is compaction -> gc. + let compaction_lock = timeline.compaction_lock.lock(); + let compaction_lock = crate::timed( + compaction_lock, + "acquires compaction lock", + std::time::Duration::from_secs(5), + ) + .await; + + let gc_lock = timeline.gc_lock.lock(); + let gc_lock = crate::timed( + gc_lock, + "acquires gc lock", std::time::Duration::from_secs(5), ) .await; @@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory( .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); - drop(guards); + drop(gc_lock); + drop(compaction_lock); fail::fail_point!("timeline-delete-after-rm", |_| { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?