From 8ae6f656a694a2d6892ce6ebd1475d1b831ba917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 3 Feb 2025 05:11:06 +0100 Subject: [PATCH] Don't require partial backup semaphore capacity for deletions (#10628) In the safekeeper, we block deletions on the timeline's gate closing, and any `WalResidentTimeline` keeps the gate open (because it owns a gate lock object). Thus, unless the `main_task` function of a partial backup returns, we can't delete the associated timeline. In order to make these tasks exit early, we trigger the cancellation token of the timeline upon its shutdown. However, the partial backup task wasn't looking for the cancellation while waiting to acquire a partial backup permit. On a staging safekeeper we have been in a situation in the past where the semaphore was already empty for a duration of many hours, rendering all attempted deletions unable to proceed until a restart where the semaphore was reset: https://neondb.slack.com/archives/C03H1K0PGKH/p1738416586442029 --- safekeeper/src/wal_backup_partial.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 4e5b34a9bf..5ecb23e8e0 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -535,6 +535,10 @@ pub async fn main_task( // limit concurrent uploads let _upload_permit = tokio::select! { acq = limiter.acquire_partial_backup() => acq, + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return None; + } _ = cancel.cancelled() => { info!("task canceled"); return None;