pageserver: tighten compaction failure detection (#10502)

## Problem If compaction fails, we disable L0 flush stalls to avoid persistent stalls. However, the logic would unset the failure marker on offload failures or shutdown. This can lead to sudden L0 flush stalls if we try and fail to offload a timeline with compaction failures, or if there is some kind of shutdown race. Touches #10405. ## Summary of changes Don't touch the compaction failure marker on offload failures or shutdown.
2025-12-22 21:59:59 +00:00 · 2025-01-24 14:55:05 +01:00
parent d8ab6ddb0f
commit ef2a2555b1
1 changed files with 9 additions and 7 deletions
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1704,14 +1704,16 @@ impl Timeline {
        };

        // Signal compaction failure to avoid L0 flush stalls when it's broken.
-        let compaction_failed = match result {
-            Ok(_) => false,
-            Err(CompactionError::Offload(_)) => false, // doesn't halt compaction
-            Err(CompactionError::ShuttingDown) => false, // not a failure
-            Err(CompactionError::Other(_)) => true,
+        match result {
+            Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
+            Err(CompactionError::Other(_)) => {
+                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
+            }
+            // Don't change the current value on offload failure or shutdown. We don't want to
+            // abruptly stall nor resume L0 flushes in these cases.
+            Err(CompactionError::Offload(_)) => {}
+            Err(CompactionError::ShuttingDown) => {}
        };
-        self.compaction_failed
-            .store(compaction_failed, AtomicOrdering::Relaxed);

        result
    }