diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 16f35355af..39282ce320 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -61,6 +61,7 @@ pub mod defaults { pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; /// /// Default built-in configuration file. @@ -89,6 +90,8 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' +#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -170,6 +173,9 @@ pub struct PageServerConf { pub metric_collection_endpoint: Option, pub synthetic_size_calculation_interval: Duration, + // See the corresponding metric's help string. 
+ pub evictions_low_residence_duration_metric_threshold: Duration, + pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -240,6 +246,8 @@ struct PageServerConfigBuilder { metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + evictions_low_residence_duration_metric_threshold: BuilderValue, + test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, @@ -293,6 +301,11 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), + test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), @@ -408,6 +421,10 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) { + self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); + } + pub fn ondemand_download_behavior_treat_error_as_warn( &mut self, ondemand_download_behavior_treat_error_as_warn: bool, @@ -481,6 +498,11 @@ impl PageServerConfigBuilder { synthetic_size_calculation_interval: self .synthetic_size_calculation_interval .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, + evictions_low_residence_duration_metric_threshold: self + .evictions_low_residence_duration_metric_threshold + .ok_or(anyhow!( + "missing evictions_low_residence_duration_metric_threshold" + ))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -670,6 +692,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => 
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -810,6 +833,10 @@ impl PageServerConf { cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .unwrap(), test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, } @@ -951,6 +978,9 @@ metric_collection_interval = '222 s' cached_metric_collection_interval = '22200 s' metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' + +evictions_low_residence_duration_metric_threshold = '444 s' + log_format = 'json' "#; @@ -1005,6 +1035,9 @@ log_format = 'json' synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD + )?, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1056,6 +1089,7 @@ log_format = 'json' cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), + evictions_low_residence_duration_metric_threshold: 
Duration::from_secs(444), test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6a8aecfd25..b5563ad186 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -194,6 +194,93 @@ static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static EVICTIONS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_evictions", + "Number of layers evicted from the pageserver", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_evictions_with_low_residence_duration", + "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \ + Residence duration is determined using the `residence_duration_data_source`.", + &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] + ) + .expect("failed to define a metric") +}); + +/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. 
+#[derive(Debug)]
+pub struct EvictionsWithLowResidenceDuration {
+    data_source: &'static str, // value of the `residence_duration_data_source` label
+    threshold: Duration, // residence durations strictly below this are counted
+    counter: Option<IntCounter>, // `None` once `remove` has taken the counter
+}
+
+pub struct EvictionsWithLowResidenceDurationBuilder { // label values known before the ids are
+    data_source: &'static str,
+    threshold: Duration,
+}
+
+impl EvictionsWithLowResidenceDurationBuilder {
+    pub fn new(data_source: &'static str, threshold: Duration) -> Self {
+        Self {
+            data_source,
+            threshold,
+        }
+    }
+
+    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .get_metric_with_label_values(&[ // same order as the label names at registration
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
+            ])
+            .unwrap();
+        EvictionsWithLowResidenceDuration {
+            data_source: self.data_source,
+            threshold: self.threshold,
+            counter: Some(counter),
+        }
+    }
+}
+
+impl EvictionsWithLowResidenceDuration {
+    fn threshold_label_value(threshold: Duration) -> String { // whole seconds, as a label value
+        threshold.as_secs().to_string()
+    }
+
+    pub fn observe(&self, observed_value: Duration) { // panics if called after `remove`
+        if observed_value < self.threshold { // "low residence": resident for less than threshold
+            self.counter
+                .as_ref()
+                .expect("nobody calls this function after `remove_from_vec`") // presumably `remove`
+                .inc();
+        }
+    }
+
+    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
+    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+        let Some(_counter) = self.counter.take() else { // already removed: nothing to do
+            return;
+        };
+        EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .remove_label_values(&[ // must be the same label values `build` used
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &Self::threshold_label_value(self.threshold),
+            ])
+            .expect("we own the metric, no-one else should remove it");
+    }
+}
+
 // Metrics collected on disk IO operations
 //
 // Roughly logarithmic scale.
@@ -510,10 +597,16 @@ pub struct TimelineMetrics { pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, pub persistent_bytes_written: IntCounter, + pub evictions: IntCounter, + pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration, } impl TimelineMetrics { - pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + pub fn new( + tenant_id: &TenantId, + timeline_id: &TimelineId, + evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, + ) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); let reconstruct_time_histo = RECONSTRUCT_TIME @@ -550,6 +643,11 @@ impl TimelineMetrics { let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let evictions = EVICTIONS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let evictions_with_low_residence_duration = + evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); TimelineMetrics { tenant_id, @@ -569,6 +667,8 @@ impl TimelineMetrics { current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, + evictions, + evictions_with_low_residence_duration, } } } @@ -585,7 +685,9 @@ impl Drop for TimelineMetrics { let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); - + let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + self.evictions_with_low_residence_duration + .remove(tenant_id, timeline_id); for op in STORAGE_TIME_OPERATIONS { let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); @@ -620,7 +722,7 @@ use std::collections::HashMap; use std::pin::Pin; use std::sync::{Arc, Mutex}; use std::task::{Context, 
Poll}; -use std::time::Instant; +use std::time::{Duration, Instant}; pub struct RemoteTimelineClientMetrics { tenant_id: String, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0c42ed3079..f5dbe63b0b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1051,6 +1051,22 @@ impl Timeline { .file_size() .expect("Local layer should have a file size"); + let local_layer_mtime = local_layer + .local_path() + .expect("local layer should have a local path") + .metadata() + .context("get local layer file stat")? + .modified() + .context("get mtime of layer file")?; + let local_layer_residence_duration = + match SystemTime::now().duration_since(local_layer_mtime) { + Err(e) => { + warn!("layer mtime is in the future: {}", e); + None + } + Ok(delta) => Some(delta), + }; + let layer_metadata = LayerFileMetadata::new(layer_file_size); let new_remote_layer = Arc::new(match local_layer.filename() { @@ -1093,6 +1109,14 @@ impl Timeline { .resident_physical_size_gauge .sub(layer_file_size); + self.metrics.evictions.inc(); + + if let Some(delta) = local_layer_residence_duration { + self.metrics + .evictions_with_low_residence_duration + .observe(delta); + } + true } Replacement::NotFound => { @@ -1208,7 +1232,14 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + metrics: TimelineMetrics::new( + &tenant_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + conf.evictions_low_residence_duration_metric_threshold, + ), + ), flush_loop_state: Mutex::new(FlushLoopState::NotStarted), diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 450c02735a..2984f2c7d3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -78,5 +78,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", "pageserver_tenant_states_count", + "pageserver_evictions_total", + "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, )