diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 52527ffa90..383c174684 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -347,6 +347,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_upper_limit: settings + .remove("compaction_upper_limit") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_upper_limit' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 40c8837af5..422da0dc95 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -256,6 +256,11 @@ pub struct TenantConfigToml { pub compaction_period: Duration, /// Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + /// Controls the amount of L0 included in a single compaction iteration. + /// The unit is `checkpoint_distance`, i.e., a size. + /// We add L0s to the set of layers to compact until their cumulative + /// size exceeds `compaction_upper_limit * checkpoint_distance`. + pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer @@ -523,6 +528,12 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on + // most of our pageservers. 
Compacting ~50 layers requires about 2GB of memory (could be reduced later by optimizing L0 hole + // calculation to avoid loading all keys into memory). So with this config, we can get a maximum peak compaction memory usage of 18GB. + pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; @@ -563,6 +574,7 @@ impl Default for TenantConfigToml { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_upper_limit: DEFAULT_COMPACTION_UPPER_LIMIT, compaction_algorithm: crate::models::CompactionAlgorithmSettings { kind: DEFAULT_COMPACTION_ALGORITHM, }, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16f89ae13b..43447c67bd 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -458,6 +458,8 @@ pub struct TenantConfigPatch { pub compaction_period: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub compaction_upper_limit: FieldPatch, // defer parsing compaction_algorithm, like eviction_policy #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub compaction_algorithm: FieldPatch, @@ -522,6 +524,7 @@ pub struct TenantConfig { pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy pub compaction_algorithm: Option, pub l0_flush_delay_threshold: Option, @@ -559,6 +562,7 @@ impl TenantConfig { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, @@ -594,6 +598,9 @@ 
impl TenantConfig { .apply(&mut compaction_target_size); patch.compaction_period.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch .l0_flush_delay_threshold @@ -653,6 +660,7 @@ impl TenantConfig { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index ee43440534..4b976e7f6f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -984,6 +984,8 @@ components: type: string compaction_threshold: type: string + compaction_upper_limit: + type: string image_creation_threshold: type: integer walreceiver_connect_timeout: diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4361fa3d66..085f76c05d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3816,6 +3816,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + pub fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + pub fn get_gc_horizon(&self) -> u64 { let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf @@ -5469,6 +5476,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_upper_limit: Some(tenant_conf.compaction_upper_limit), compaction_algorithm: Some(tenant_conf.compaction_algorithm), l0_flush_delay_threshold: tenant_conf.l0_flush_delay_threshold, 
l0_flush_stall_threshold: tenant_conf.l0_flush_stall_threshold, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50da998c30..139ed27bd2 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -277,6 +277,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_upper_limit: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub compaction_algorithm: Option, @@ -401,6 +405,9 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_upper_limit: self + .compaction_upper_limit + .unwrap_or(global_conf.compaction_upper_limit), compaction_algorithm: self .compaction_algorithm .as_ref() @@ -478,6 +485,7 @@ impl TenantConfOpt { mut compaction_target_size, mut compaction_period, mut compaction_threshold, + mut compaction_upper_limit, mut compaction_algorithm, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, @@ -519,6 +527,9 @@ impl TenantConfOpt { .map(|v| humantime::parse_duration(&v))? 
.apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); + patch + .compaction_upper_limit + .apply(&mut compaction_upper_limit); patch.compaction_algorithm.apply(&mut compaction_algorithm); patch .l0_flush_delay_threshold @@ -596,6 +607,7 @@ impl TenantConfOpt { compaction_target_size, compaction_period, compaction_threshold, + compaction_upper_limit, compaction_algorithm, l0_flush_delay_threshold, l0_flush_stall_threshold, @@ -657,6 +669,7 @@ impl From for models::TenantConfig { compaction_target_size: value.compaction_target_size, compaction_period: value.compaction_period.map(humantime), compaction_threshold: value.compaction_threshold, + compaction_upper_limit: value.compaction_upper_limit, l0_flush_delay_threshold: value.l0_flush_delay_threshold, l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2033ebcdeb..f3cdad82d9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2181,6 +2181,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + fn get_compaction_upper_limit(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .compaction_upper_limit + .unwrap_or(self.conf.default_tenant_conf.compaction_upper_limit) + } + fn get_l0_flush_delay_threshold(&self) -> Option { // Disable L0 flushes by default. This and compaction needs further tuning. const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 
3 diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ad19738bc2..76dcc159ea 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -47,9 +47,7 @@ use crate::tenant::timeline::{ImageLayerCreationOutcome, IoConcurrency}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; -use pageserver_api::config::tenant_conf_defaults::{ - DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, -}; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; @@ -1117,14 +1115,7 @@ impl Timeline { // Under normal circumstances, we will accumulate up to compaction_interval L0s of size // checkpoint_distance each. To avoid edge cases using extra system resources, bound our // work in this function to only operate on this much delta data at once. - // - // Take the max of the configured value & the default, so that tests that configure tiny values - // can still use a sensible amount of memory, but if a deployed system configures bigger values we - // still let them compact a full stack of L0s in one go. 
- let delta_size_limit = std::cmp::max( - self.get_compaction_threshold(), - DEFAULT_COMPACTION_THRESHOLD, - ) as u64 + let delta_size_limit = self.get_compaction_upper_limit() as u64 * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE); let mut fully_compacted = true; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 8b92e4c442..e88d245c8f 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -139,6 +139,7 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config = { "compaction_period": "1h", "compaction_threshold": 13, + "compaction_upper_limit": 100, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, "l0_flush_wait_upload": True,