From 8895f28dae229d84bf58d3660968b404a3f0c2e0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Fri, 14 Apr 2023 12:25:45 +0200
Subject: [PATCH] make evictions_low_residence_duration_metric_threshold per-tenant (#3949)

Before this patch, if a tenant overrode its eviction_policy setting to use a LayerAccessThreshold::threshold lower than the global `evictions_low_residence_duration_metric_threshold`, the evictions done for that tenant would count towards the `evictions_with_low_residence_duration` metric. That metric is used to identify premature evictions, commonly triggered by disk-usage-based eviction under disk pressure. We don't want that to happen for the legitimate evictions of a tenant that overrides its eviction_policy.

So, this patch
- moves the setting into TenantConf
- adds test coverage
- updates the staging & prod yamls

Forward Compatibility: Software before this patch will ignore the new tenant conf field and use the global one instead, so we can roll back safely.

Backward Compatibility: Parsing old configs with software as of this patch will fail in `PageServerConf::parse_and_validate` with error `unrecognized pageserver option 'evictions_low_residence_duration_metric_threshold'` if the option is still present in the global section. We deal with this by updating the configs in Ansible (a minimal pageserver.toml sketch appears after the diff below).

fixes https://github.com/neondatabase/neon/issues/3940
---
 .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +-
 .github/ansible/prod.eu-central-1.hosts.yaml | 2 +-
 .github/ansible/prod.us-east-2.hosts.yaml | 2 +-
 .github/ansible/prod.us-west-2.hosts.yaml | 8 +-
 .github/ansible/staging.eu-west-1.hosts.yaml | 2 +-
 .github/ansible/staging.us-east-2.hosts.yaml | 2 +-
 control_plane/src/pageserver.rs | 6 ++
 libs/pageserver_api/src/models.rs | 3 +
 pageserver/src/config.rs | 40 ++------
 pageserver/src/http/routes.rs | 26 +++++
 pageserver/src/metrics.rs | 24 ++++-
 pageserver/src/tenant.rs | 10 ++
 pageserver/src/tenant/config.rs | 16 ++++
 pageserver/src/tenant/timeline.rs | 41 +++++++-
 test_runner/fixtures/pageserver/http.py | 7 ++
 test_runner/regress/test_tenant_conf.py | 94 ++++++++++++++++++-
 16 files changed, 239 insertions(+), 46 deletions(-)

diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index c185086eef..9c53733491 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 0a0f974ea4..3186519ca8 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 4427bb344e..3062475b20 100644 ---
a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 53626b4f59..9cf847bcb1 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "10m" threshold: &default_eviction_threshold "24h" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" @@ -34,7 +34,7 @@ storage: pageservers: hosts: pageserver-0.us-west-2.aws.neon.tech: - ansible_host: i-0d9f6dfae0e1c780d + ansible_host: i-0d9f6dfae0e1c780d pageserver-1.us-west-2.aws.neon.tech: ansible_host: i-0c834be1dddba8b3f pageserver-2.us-west-2.aws.neon.tech: @@ -49,5 +49,5 @@ storage: safekeeper-1.us-west-2.aws.neon.tech: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: - ansible_host: i-042b7efb1729d7966 - + ansible_host: i-042b7efb1729d7966 + diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 34c8e77280..39f5613935 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "20m" threshold: &default_eviction_threshold "20m" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 94f2be83a4..e63ed6e639 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -17,7 +17,7 @@ storage: kind: "LayerAccessThreshold" period: "20m" threshold: &default_eviction_threshold "20m" - evictions_low_residence_duration_metric_threshold: *default_eviction_threshold + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 094069e4c0..b700d426ba 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -368,6 +368,9 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'min_resident_size_override' as integer")?, + evictions_low_residence_duration_metric_threshold: settings + .remove("evictions_low_residence_duration_metric_threshold") + .map(|x| x.to_string()), }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -445,6 +448,9 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'min_resident_size_override' as an integer")?, + evictions_low_residence_duration_metric_threshold: settings + 
.get("evictions_low_residence_duration_metric_threshold") + .map(|x| x.to_string()), }) .send()? .error_from_body()?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a351761f4a..15c37b9453 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -135,6 +135,7 @@ pub struct TenantCreateRequest { // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, pub min_resident_size_override: Option, + pub evictions_low_residence_duration_metric_threshold: Option, } #[serde_as] @@ -181,6 +182,7 @@ pub struct TenantConfigRequest { // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, pub min_resident_size_override: Option, + pub evictions_low_residence_duration_metric_threshold: Option, } impl TenantConfigRequest { @@ -202,6 +204,7 @@ impl TenantConfigRequest { trace_read_requests: None, eviction_policy: None, min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: None, } } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 19f0f22815..826cf1aab3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -62,7 +62,6 @@ pub mod defaults { pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; /// /// Default built-in configuration file. @@ -91,7 +90,6 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} @@ -108,6 +106,7 @@ pub mod defaults { #pitr_interval = '{DEFAULT_PITR_INTERVAL}' #min_resident_size_override = .. # in bytes +#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' # [remote_storage] @@ -182,9 +181,6 @@ pub struct PageServerConf { pub metric_collection_endpoint: Option, pub synthetic_size_calculation_interval: Duration, - // See the corresponding metric's help string. 
- pub evictions_low_residence_duration_metric_threshold: Duration, - pub disk_usage_based_eviction: Option, pub test_remote_failures: u64, @@ -257,8 +253,6 @@ struct PageServerConfigBuilder { metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, - evictions_low_residence_duration_metric_threshold: BuilderValue, - disk_usage_based_eviction: BuilderValue>, test_remote_failures: BuilderValue, @@ -316,11 +310,6 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration( - DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), - disk_usage_based_eviction: Set(None), test_remote_failures: Set(0), @@ -438,10 +427,6 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } - pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) { - self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); - } - pub fn disk_usage_based_eviction(&mut self, value: Option) { self.disk_usage_based_eviction = BuilderValue::Set(value); } @@ -525,11 +510,6 @@ impl PageServerConfigBuilder { synthetic_size_calculation_interval: self .synthetic_size_calculation_interval .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - evictions_low_residence_duration_metric_threshold: self - .evictions_low_residence_duration_metric_threshold - .ok_or(anyhow!( - "missing evictions_low_residence_duration_metric_threshold" - ))?, disk_usage_based_eviction: self .disk_usage_based_eviction .ok_or(anyhow!("missing disk_usage_based_eviction"))?, @@ -721,7 +701,6 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), "disk_usage_based_eviction" => { tracing::info!("disk_usage_based_eviction: {:#?}", &item); builder.disk_usage_based_eviction( @@ -839,6 +818,13 @@ impl PageServerConf { ); } + if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") { + t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration( + "evictions_low_residence_duration_metric_threshold", + item, + )?); + } + Ok(t_conf) } @@ -877,10 +863,6 @@ impl PageServerConf { cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .unwrap(), disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, @@ -1029,8 +1011,6 @@ cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' -evictions_low_residence_duration_metric_threshold = '444 s' - log_format = 'json' "#; @@ -1087,9 +1067,6 @@ log_format = 'json' 
synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD - )?, disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, @@ -1144,7 +1121,6 @@ log_format = 'json' cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), - evictions_low_residence_duration_metric_threshold: Duration::from_secs(444), disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e7a86e4822..06a97f6dff 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -781,6 +781,19 @@ async fn tenant_create_handler(mut request: Request) -> Result, } impl TimelineMetrics { @@ -656,7 +672,9 @@ impl TimelineMetrics { num_persistent_files_created, persistent_bytes_written, evictions, - evictions_with_low_residence_duration, + evictions_with_low_residence_duration: std::sync::RwLock::new( + evictions_with_low_residence_duration, + ), } } } @@ -675,6 +693,8 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); self.evictions_with_low_residence_duration + .write() + .unwrap() .remove(tenant_id, timeline_id); for op in STORAGE_TIME_OPERATIONS { let _ = diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d98aa5c566..18a4d7617b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1735,6 +1735,13 @@ impl Tenant { pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { *self.tenant_conf.write().unwrap() = new_tenant_conf; + // Don't hold self.timelines.lock() during the notifies. + // There's no risk of deadlock right now, but there could be if we consolidate + // mutexes in struct Timeline in the future. + let timelines = self.list_timelines(); + for timeline in timelines { + timeline.tenant_conf_updated(); + } } fn create_timeline_data( @@ -2815,6 +2822,9 @@ pub mod harness { trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), min_resident_size_override: tenant_conf.min_resident_size_override, + evictions_low_residence_duration_metric_threshold: Some( + tenant_conf.evictions_low_residence_duration_metric_threshold, + ), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index cdabb23a7b..c01a8aa8c0 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -39,6 +39,7 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; } /// Per-tenant configuration options @@ -93,6 +94,9 @@ pub struct TenantConf { pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, pub min_resident_size_override: Option, + // See the corresponding metric's help string. 
+ #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, } /// Same as TenantConf, but this struct preserves the information about @@ -164,6 +168,11 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub min_resident_size_override: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + #[serde(default)] + pub evictions_low_residence_duration_metric_threshold: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -228,6 +237,9 @@ impl TenantConfOpt { min_resident_size_override: self .min_resident_size_override .or(global_conf.min_resident_size_override), + evictions_low_residence_duration_metric_threshold: self + .evictions_low_residence_duration_metric_threshold + .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), } } } @@ -260,6 +272,10 @@ impl Default for TenantConf { trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 29d8b544cc..b8b1f963e5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -77,6 +77,7 @@ pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::config::TenantConf; use super::layer_map::BatchedUpdates; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -161,7 +162,7 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - metrics: TimelineMetrics, + pub(super) metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. @@ -1136,6 +1137,8 @@ impl Timeline { if let Some(delta) = local_layer_residence_duration { self.metrics .evictions_with_low_residence_duration + .read() + .unwrap() .observe(delta); info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); } else { @@ -1209,6 +1212,35 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } + fn get_evictions_low_residence_duration_metric_threshold( + tenant_conf: &TenantConfOpt, + default_tenant_conf: &TenantConf, + ) -> Duration { + tenant_conf + .evictions_low_residence_duration_metric_threshold + .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) + } + + pub(super) fn tenant_conf_updated(&self) { + // NB: Most tenant conf options are read by background loops, so, + // changes will automatically be picked up. + + // The threshold is embedded in the metric. So, we need to update it. + { + let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( + &self.tenant_conf.read().unwrap(), + &self.conf.default_tenant_conf, + ); + let tenant_id_str = self.tenant_id.to_string(); + let timeline_id_str = self.timeline_id.to_string(); + self.metrics + .evictions_with_low_residence_duration + .write() + .unwrap() + .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold); + } + } + /// Open a Timeline handle. 
/// /// Loads the metadata for the timeline into memory, but not the layer map. @@ -1240,6 +1272,11 @@ impl Timeline { let max_lsn_wal_lag = tenant_conf_guard .max_lsn_wal_lag .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag); + let evictions_low_residence_duration_metric_threshold = + Self::get_evictions_low_residence_duration_metric_threshold( + &tenant_conf_guard, + &conf.default_tenant_conf, + ); drop(tenant_conf_guard); Arc::new_cyclic(|myself| { @@ -1287,7 +1324,7 @@ impl Timeline { &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", - conf.evictions_low_residence_duration_metric_threshold, + evictions_low_residence_duration_metric_threshold, ), ), diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 1e1effe295..69042478c7 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -519,6 +519,13 @@ class PageserverHttpClient(requests.Session): assert res.status_code == 200 + def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + info = self.layer_map_info(tenant_id, timeline_id) + for layer in info.historic_layers: + if not layer.remote: + continue + self.download_layer(tenant_id, timeline_id, layer.layer_file_name) + def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): res = self.delete( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 28f1a960df..1ed86d19a2 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -18,7 +18,11 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = """ page_cache_size=444; wait_lsn_timeout='111 s'; -tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" +[tenant_config] +checkpoint_distance = 10000 +compaction_target_size = 1048576 +evictions_low_residence_duration_metric_threshold = "2 days" +""" env = neon_env_builder.init_start() http_client = env.pageserver.http_client() @@ -39,6 +43,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" new_conf = { "checkpoint_distance": "20000", "gc_period": "30sec", + "evictions_low_residence_duration_metric_threshold": "42s", } tenant, _ = env.neon_cli.create_tenant(conf=new_conf) @@ -78,6 +83,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert effective_config["gc_period"] == "1h" assert effective_config["image_creation_threshold"] == 3 assert effective_config["pitr_interval"] == "7days" + assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" # check the configuration of the new tenant with closing(env.pageserver.connect()) as psconn: @@ -112,6 +118,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert ( new_effective_config["gc_period"] == "30s" ), "Specific 'gc_period' config should override the default value" + assert ( + new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s" + ), "Should override default value" assert new_effective_config["compaction_target_size"] == 1048576 assert new_effective_config["compaction_period"] == "20s" assert new_effective_config["compaction_threshold"] == 10 @@ -125,6 +134,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 
1048576}""" "gc_period": "80sec", "compaction_period": "80sec", "image_creation_threshold": "2", + "evictions_low_residence_duration_metric_threshold": "23h", } env.neon_cli.config_tenant( tenant_id=tenant, @@ -167,6 +177,9 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert ( updated_effective_config["compaction_period"] == "1m 20s" ), "Specific 'compaction_period' config should override the default value" + assert ( + updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h" + ), "Should override default value" assert updated_effective_config["compaction_target_size"] == 1048576 assert updated_effective_config["compaction_threshold"] == 10 assert updated_effective_config["gc_horizon"] == 67108864 @@ -225,6 +238,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" assert final_effective_config["gc_horizon"] == 67108864 assert final_effective_config["gc_period"] == "1h" assert final_effective_config["image_creation_threshold"] == 3 + assert final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days" # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -285,3 +299,81 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # dont test applying the setting here, we have that another test case to show it # we just care about being able to create the file assert len(contents_first) > len(contents_later) + + +def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + ps_http = env.pageserver.http_client() + + def get_metric(): + metrics = ps_http.get_metrics() + metric = metrics.query_one( + "pageserver_evictions_with_low_residence_duration_total", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + return metric + + default_value = ps_http.tenant_config(tenant_id).effective_config[ + "evictions_low_residence_duration_metric_threshold" + ] + metric = get_metric() + assert int(metric.value) == 0, "metric is present with default value" + + assert default_value == "1day" + + ps_http.download_all_layers(tenant_id, timeline_id) + ps_http.evict_all_layers(tenant_id, timeline_id) + metric = get_metric() + assert int(metric.value) > 0, "metric is updated" + + env.neon_cli.config_tenant( + tenant_id, {"evictions_low_residence_duration_metric_threshold": default_value} + ) + updated_metric = get_metric() + assert int(updated_metric.value) == int( + metric.value + ), "metric is unchanged when setting same value" + + env.neon_cli.config_tenant( + tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"} + ) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 + assert int(metric.value) == 0 + + ps_http.download_all_layers(tenant_id, timeline_id) + ps_http.evict_all_layers(tenant_id, timeline_id) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 + assert int(metric.value) > 0 + + env.neon_cli.config_tenant( + tenant_id, {"evictions_low_residence_duration_metric_threshold": "2h"} + ) + metric = 
get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 + assert int(metric.value) == 0, "value resets if label changes" + + ps_http.download_all_layers(tenant_id, timeline_id) + ps_http.evict_all_layers(tenant_id, timeline_id) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 + assert int(metric.value) > 0, "set a non-zero value for next step" + + env.neon_cli.config_tenant(tenant_id, {}) + metric = get_metric() + assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default" + assert int(metric.value) == 0, "value resets to default"
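
For a standalone pageserver.toml (rather than the Ansible inventories patched above), the relocation described under "Backward Compatibility" looks roughly like the sketch below. The key name and the '24 hour' default come from this patch; the surrounding options are omitted and the value is illustrative, so treat this as a sketch rather than a complete config:

    # Before this patch: a pageserver-wide option at the top level of pageserver.toml.
    # With this patch, leaving it here makes PageServerConf::parse_and_validate fail
    # with "unrecognized pageserver option ...".
    #evictions_low_residence_duration_metric_threshold = '24 hour'

    # After this patch: a per-tenant default under [tenant_config],
    # overridable for individual tenants at runtime.
    [tenant_config]
    evictions_low_residence_duration_metric_threshold = '24 hour'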
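Because the field is now part of TenantConfigRequest, it can also be overridden for a single tenant at runtime through the pageserver management API, which is what `env.neon_cli.config_tenant` ends up calling in the new regression test. A hedged sketch, assuming the PUT /v1/tenant/config route that TenantConfigRequest is bound to and the default management port 9898 (neither is visible in the truncated routes.rs hunk above), with a placeholder tenant id:

    curl -X PUT http://localhost:9898/v1/tenant/config \
      -H 'Content-Type: application/json' \
      -d '{
            "tenant_id": "<tenant id>",
            "evictions_low_residence_duration_metric_threshold": "2 days"
          }'

Changing the threshold swaps the metric's low_threshold_secs label, so the counter under the new label starts from zero; that is exactly what test_live_reconfig_get_evictions_low_residence_duration_metric_threshold asserts.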
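One detail that makes the test assertions easier to read: durations in the effective config are (de)serialized with humantime, so a value configured as "2 days" reads back as "2days" and the 24-hour default reads back as "1day". A minimal Rust sketch of that round trip, assuming the humantime crate that the humantime_serde annotations above pull in as a dependency:

    use std::time::Duration;

    fn main() {
        // Parsing accepts the spelled-out forms used in the configs and tests.
        let one_day = humantime::parse_duration("24 hour").unwrap();
        assert_eq!(one_day, Duration::from_secs(24 * 60 * 60));

        // Formatting produces the compact forms the test compares against.
        assert_eq!(humantime::format_duration(one_day).to_string(), "1day");
        assert_eq!(
            humantime::format_duration(Duration::from_secs(2 * 24 * 60 * 60)).to_string(),
            "2days"
        );
    }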