mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-16 18:02:56 +00:00
pageserver: add enforce_circuit_breakers tenant config
This commit is contained in:
@@ -400,6 +400,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
enforce_circuit_breakers: settings
|
||||
.remove("enforce_circuit_breakers")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'enforce_circuit_breakers' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -505,6 +510,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
enforce_circuit_breakers: settings
|
||||
.remove("enforce_circuit_breakers")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'enforce_circuit_breakers' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -283,6 +283,7 @@ pub struct TenantConfig {
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub enforce_circuit_breakers: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
||||
@@ -1971,14 +1971,19 @@ impl Tenant {
|
||||
&& amplification > PATHOLOGICAL_AMPLIFICATION_FACTOR
|
||||
{
|
||||
tracing::error!("Pathological storage amplification detected (synthetic size {synthetic_size}, physical size {total_physical}): shutting down ingest");
|
||||
for (timeline_id, timeline) in timelines_to_compact {
|
||||
if tokio::time::timeout(Duration::from_secs(5), timeline.kill_wal_receiver())
|
||||
if self.get_enforce_circuit_breakers() {
|
||||
for (timeline_id, timeline) in timelines_to_compact {
|
||||
if tokio::time::timeout(
|
||||
Duration::from_secs(5),
|
||||
timeline.kill_wal_receiver(),
|
||||
)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
tracing::error!(
|
||||
"Timed out shutting down WAL intest on timeline {timeline_id}"
|
||||
);
|
||||
{
|
||||
tracing::error!(
|
||||
"Timed out shutting down WAL intest on timeline {timeline_id}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2634,6 +2639,16 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether circuit breakers are enforced for this tenant.
///
/// The tenant-specific `enforce_circuit_breakers` override is used when it is
/// `Some`; otherwise the value from `self.conf.default_tenant_conf` applies.
///
/// # Panics
///
/// Panics if `read()` on `self.tenant_conf` returns an error (presumably a
/// poisoned lock — confirm against the field's lock type).
pub(crate) fn get_enforce_circuit_breakers(&self) -> bool {
|
||||
// Copy the optional per-tenant override out of the guarded config; the read
// guard is a temporary of this statement, so it is not held past it.
let tenant_conf = self
|
||||
.tenant_conf
|
||||
.read()
|
||||
.unwrap()
|
||||
.tenant_conf
|
||||
.enforce_circuit_breakers;
|
||||
// Fall back to the default tenant configuration when no override is set.
tenant_conf.unwrap_or(self.conf.default_tenant_conf.enforce_circuit_breakers)
|
||||
}
|
||||
|
||||
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
|
||||
// Don't hold self.timelines.lock() during the notifies.
|
||||
@@ -4003,6 +4018,7 @@ pub(crate) mod harness {
|
||||
gc_feedback: Some(tenant_conf.gc_feedback),
|
||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||
enforce_circuit_breakers: Some(tenant_conf.enforce_circuit_breakers),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,6 +49,8 @@ pub mod defaults {
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
pub const DEFAULT_ENFORCE_CIRCUIT_BREAKERS: bool = false;
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
@@ -348,6 +350,10 @@ pub struct TenantConf {
|
||||
|
||||
/// If true, SLRU segments are downloaded on demand; if false, SLRU segments are included in basebackup
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
/// If true, then the tenant will automatically shut off external APIs (e.g. wal ingest, page service) in
|
||||
/// response to high failure rates that likely indicate a bug.
|
||||
pub enforce_circuit_breakers: bool,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -437,6 +443,10 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub enforce_circuit_breakers: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -485,6 +495,9 @@ impl TenantConfOpt {
|
||||
lazy_slru_download: self
|
||||
.lazy_slru_download
|
||||
.unwrap_or(global_conf.lazy_slru_download),
|
||||
enforce_circuit_breakers: self
|
||||
.enforce_circuit_breakers
|
||||
.unwrap_or(global_conf.enforce_circuit_breakers),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -524,6 +537,7 @@ impl Default for TenantConf {
|
||||
gc_feedback: false,
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
enforce_circuit_breakers: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -596,6 +610,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
gc_feedback: value.gc_feedback,
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
enforce_circuit_breakers: value.enforce_circuit_breakers,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,6 +174,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"pitr_interval": "1m",
|
||||
"lagging_wal_timeout": "23m",
|
||||
"lazy_slru_download": True,
|
||||
"enforce_circuit_breakers": True,
|
||||
"max_lsn_wal_lag": 230000,
|
||||
"min_resident_size_override": 23,
|
||||
"trace_read_requests": True,
|
||||
|
||||
Reference in New Issue
Block a user