From ef2a2555b1c001ee6af9bf4bc9896aa6bd09b9ed Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 24 Jan 2025 14:55:05 +0100 Subject: [PATCH] pageserver: tighten compaction failure detection (#10502) ## Problem If compaction fails, we disable L0 flush stalls to avoid persistent stalls. However, the logic would unset the failure marker on offload failures or shutdown. This can lead to sudden L0 flush stalls if we try and fail to offload a timeline with compaction failures, or if there is some kind of shutdown race. Touches #10405. ## Summary of changes Don't touch the compaction failure marker on offload failures or shutdown. --- pageserver/src/tenant/timeline.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fffa2c8e2b..ee43512501 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1704,14 +1704,16 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. - let compaction_failed = match result { - Ok(_) => false, - Err(CompactionError::Offload(_)) => false, // doesn't halt compaction - Err(CompactionError::ShuttingDown) => false, // not a failure - Err(CompactionError::Other(_)) => true, + match result { + Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), + Err(CompactionError::Other(_)) => { + self.compaction_failed.store(true, AtomicOrdering::Relaxed) + } + // Don't change the current value on offload failure or shutdown. We don't want to + // abruptly stall nor resume L0 flushes in these cases. + Err(CompactionError::Offload(_)) => {} + Err(CompactionError::ShuttingDown) => {} }; - self.compaction_failed - .store(compaction_failed, AtomicOrdering::Relaxed); result }