diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index a1b0ba4252..22a7af84fa 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -400,6 +400,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'lazy_slru_download' as bool")?,
+            enforce_circuit_breakers: settings
+                .remove("enforce_circuit_breakers")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'enforce_circuit_breakers' as bool")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
@@ -505,6 +510,11 @@ impl PageServerNode {
                     .map(|x| x.parse::<bool>())
                     .transpose()
                     .context("Failed to parse 'lazy_slru_download' as bool")?,
+                enforce_circuit_breakers: settings
+                    .remove("enforce_circuit_breakers")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'enforce_circuit_breakers' as bool")?,
             }
         };
 
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 46324efd43..c3d19fade6 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -283,6 +283,7 @@ pub struct TenantConfig {
     pub gc_feedback: Option<bool>,
     pub heatmap_period: Option<String>,
     pub lazy_slru_download: Option<bool>,
+    pub enforce_circuit_breakers: Option<bool>,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e688bff8cf..e870dcbd5d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1971,14 +1971,19 @@ impl Tenant {
                     && amplification > PATHOLOGICAL_AMPLIFICATION_FACTOR
                 {
                     tracing::error!("Pathological storage amplification detected (synthetic size {synthetic_size}, physical size {total_physical}): shutting down ingest");
-                    for (timeline_id, timeline) in timelines_to_compact {
-                        if tokio::time::timeout(Duration::from_secs(5), timeline.kill_wal_receiver())
+                    if self.get_enforce_circuit_breakers() {
+                        for (timeline_id, timeline) in timelines_to_compact {
+                            if tokio::time::timeout(
+                                Duration::from_secs(5),
+                                timeline.kill_wal_receiver(),
+                            )
                             .await
                             .is_err()
-                        {
-                            tracing::error!(
-                                "Timed out shutting down WAL intest on timeline {timeline_id}"
-                            );
+                            {
+                                tracing::error!(
+                                    "Timed out shutting down WAL ingest on timeline {timeline_id}"
+                                );
+                            }
                         }
                     }
                 }
@@ -2634,6 +2639,18 @@ impl Tenant {
         }
     }
 
+    /// Whether circuit breakers are enforced for this tenant: the per-tenant
+    /// override when set, otherwise the value in `self.conf.default_tenant_conf`.
+    pub(crate) fn get_enforce_circuit_breakers(&self) -> bool {
+        let tenant_conf = self
+            .tenant_conf
+            .read()
+            .unwrap()
+            .tenant_conf
+            .enforce_circuit_breakers;
+        tenant_conf.unwrap_or(self.conf.default_tenant_conf.enforce_circuit_breakers)
+    }
+
     pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
         self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
         // Don't hold self.timelines.lock() during the notifies.
@@ -4003,6 +4020,7 @@ pub(crate) mod harness {
                 gc_feedback: Some(tenant_conf.gc_feedback),
                 heatmap_period: Some(tenant_conf.heatmap_period),
                 lazy_slru_download: Some(tenant_conf.lazy_slru_download),
+                enforce_circuit_breakers: Some(tenant_conf.enforce_circuit_breakers),
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 563887088d..7170df4edc 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -49,6 +49,8 @@ pub mod defaults {
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+
+    pub const DEFAULT_ENFORCE_CIRCUIT_BREAKERS: bool = false;
 }
 
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -348,6 +350,10 @@ pub struct TenantConf {
 
     /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
     pub lazy_slru_download: bool,
+
+    /// If true, then the tenant will automatically shut off external APIs (e.g. wal ingest, page service) in
+    /// response to high failure rates that likely indicate a bug.
+    pub enforce_circuit_breakers: bool,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -437,6 +443,10 @@ pub struct TenantConfOpt {
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
     pub lazy_slru_download: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub enforce_circuit_breakers: Option<bool>,
 }
 
 impl TenantConfOpt {
@@ -485,6 +495,9 @@ impl TenantConfOpt {
             lazy_slru_download: self
                 .lazy_slru_download
                 .unwrap_or(global_conf.lazy_slru_download),
+            enforce_circuit_breakers: self
+                .enforce_circuit_breakers
+                .unwrap_or(global_conf.enforce_circuit_breakers),
         }
     }
 }
@@ -524,6 +537,7 @@ impl Default for TenantConf {
             gc_feedback: false,
             heatmap_period: Duration::ZERO,
             lazy_slru_download: false,
+            enforce_circuit_breakers: defaults::DEFAULT_ENFORCE_CIRCUIT_BREAKERS,
         }
     }
 }
@@ -596,6 +610,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             gc_feedback: value.gc_feedback,
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
+            enforce_circuit_breakers: value.enforce_circuit_breakers,
         }
     }
 }
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 7cdc314658..f38446db1f 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -174,6 +174,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "pitr_interval": "1m",
         "lagging_wal_timeout": "23m",
         "lazy_slru_download": True,
+        "enforce_circuit_breakers": True,
         "max_lsn_wal_lag": 230000,
         "min_resident_size_override": 23,
         "trace_read_requests": True,