mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
pageserver: tighten compaction failure detection (#10502)
## Problem

If compaction fails, we disable L0 flush stalls to avoid persistent stalls. However, the logic would unset the failure marker on offload failures or shutdown. This can lead to sudden L0 flush stalls if we try and fail to offload a timeline with compaction failures, or if there is some kind of shutdown race.

Touches #10405.

## Summary of changes

Don't touch the compaction failure marker on offload failures or shutdown.
This commit is contained in:
@@ -1704,14 +1704,16 @@ impl Timeline {
|
||||
};
|
||||
|
||||
// Signal compaction failure to avoid L0 flush stalls when it's broken.
|
||||
let compaction_failed = match result {
|
||||
Ok(_) => false,
|
||||
Err(CompactionError::Offload(_)) => false, // doesn't halt compaction
|
||||
Err(CompactionError::ShuttingDown) => false, // not a failure
|
||||
Err(CompactionError::Other(_)) => true,
|
||||
match result {
|
||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||
Err(CompactionError::Other(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||
// abruptly stall nor resume L0 flushes in these cases.
|
||||
Err(CompactionError::Offload(_)) => {}
|
||||
Err(CompactionError::ShuttingDown) => {}
|
||||
};
|
||||
self.compaction_failed
|
||||
.store(compaction_failed, AtomicOrdering::Relaxed);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user