From 2183290939da98fe2c400feacb6b467f863ad141 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 13 Feb 2024 14:17:10 +0000
Subject: [PATCH] pageserver: apply 5 fails -> 1hr wait circuit breaker to compaction

---
 pageserver/src/tenant.rs | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e870dcbd5d..af1d547bff 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -34,6 +34,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::backoff;
+use utils::circuit_breaker::CircuitBreaker;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::failpoint_support;
@@ -293,6 +294,10 @@ pub struct Tenant {
 
     eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
 
+    /// Track repeated failures to compact, so that we can back off.
+    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
+    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
+
     /// If the tenant is in Activating state, notify this to encourage it
     /// to proceed to Active as soon as possible, rather than waiting for lazy
     /// background warmup.
@@ -1936,6 +1941,11 @@ impl Tenant {
             timelines_to_compact
         };
 
+        // Before doing any I/O work, check our circuit breaker
+        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
+            return Ok(());
+        }
+
         let mut total_physical = 0;
         for (timeline_id, timeline) in &timelines_to_compact {
             let timeline_result = timeline
@@ -1947,9 +1957,15 @@ impl Tenant {
                 total_physical += remote_client.get_remote_physical_size();
             }
 
+            if timeline_result.is_err() {
+                self.compaction_circuit_breaker.lock().unwrap().fail();
+            }
+
             timeline_result?;
         }
 
+        self.compaction_circuit_breaker.lock().unwrap().success();
+
         // Circuit breaker: if a timeline's statistics indicate a pathological storage issue, such
         // as extremely high write inflation, then we will stop ingesting data for that timeline. This
         // reduces the blast radius of postgres/walingest bugs that might enable one tenant to generate
@@ -2808,6 +2824,10 @@ impl Tenant {
             cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
             cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
             eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
+                5,
+                Some(Duration::from_secs(3600)),
+            )),
             activate_now_sem: tokio::sync::Semaphore::new(0),
             delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
             cancel: CancellationToken::default(),