mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 02:12:56 +00:00
pageserver: apply a circuit breaker to compaction (5 failures -> 1 hour wait)
This commit is contained in:
@@ -34,6 +34,7 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::backoff;
|
||||
use utils::circuit_breaker::CircuitBreaker;
|
||||
use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
@@ -293,6 +294,10 @@ pub struct Tenant {
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
/// Track repeated failures to compact, so that we can back off.
|
||||
/// The overhead of a mutex is acceptable because compaction runs with a multi-second period.
|
||||
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
|
||||
|
||||
/// If the tenant is in Activating state, notify this to encourage it
|
||||
/// to proceed to Active as soon as possible, rather than waiting for lazy
|
||||
/// background warmup.
|
||||
@@ -1936,6 +1941,11 @@ impl Tenant {
|
||||
timelines_to_compact
|
||||
};
|
||||
|
||||
// Before doing any I/O work, check our circuit breaker
|
||||
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut total_physical = 0;
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
let timeline_result = timeline
|
||||
@@ -1947,9 +1957,15 @@ impl Tenant {
|
||||
total_physical += remote_client.get_remote_physical_size();
|
||||
}
|
||||
|
||||
if timeline_result.is_err() {
|
||||
self.compaction_circuit_breaker.lock().unwrap().fail();
|
||||
}
|
||||
|
||||
timeline_result?;
|
||||
}
|
||||
|
||||
self.compaction_circuit_breaker.lock().unwrap().success();
|
||||
|
||||
// Circuit breaker: if a timeline's statistics indicate a pathological storage issue, such
|
||||
// as extremely high write inflation, then we will stop ingesting data for that timeline. This
|
||||
// reduces the blast radius of postgres/walingest bugs that might enable one tenant to generate
|
||||
@@ -2808,6 +2824,10 @@ impl Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
|
||||
5,
|
||||
Some(Duration::from_secs(3600)),
|
||||
)),
|
||||
activate_now_sem: tokio::sync::Semaphore::new(0),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
cancel: CancellationToken::default(),
|
||||
|
||||
Reference in New Issue
Block a user